"""Inspect hardlink counts of large files under a source directory and,
optionally, export an inventory (name, path, inode, link count, MD5 checksum)
to CSV."""
import argparse
import hashlib
import os  # used only by the commented-out move logic at the bottom
import pathlib
from datetime import datetime

import pandas as pd

def find_all_files(directory: pathlib.Path) -> list:
    """Recursively collect every path under `directory`, skipping anything
    whose path contains "eaDir" (e.g. Synology @eaDir metadata folders)."""
    ps = []
    for p in directory.rglob('*'):
        if "eaDir" not in str(p):
            ps.append(p)
    return ps


def view_unique_files(p: pathlib.Path, min_size):
    """Print hardlink information for files larger than `min_size` MB."""
    stat = p.lstat()
    if stat.st_size > min_size * 1024 * 1024:
        if stat.st_nlink == 1:
            print(f"File with only one hardlink: {p}; size: "
                  f"{stat.st_size / 1024 / 1024} MB with number of links: {stat.st_nlink}")
        else:
            print(f"File with more than one hardlink: {p}; size: "
                  f"{stat.st_size / 1024 / 1024} MB with number of links: {stat.st_nlink}")

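
# A minimal, self-contained sketch of the st_nlink bookkeeping that
# view_unique_files relies on (paths here are illustrative); kept as a comment
# so importing or running this script is unaffected:
#
#   import os, pathlib, tempfile
#   d = pathlib.Path(tempfile.mkdtemp())
#   a = d / "a.bin"
#   a.write_bytes(b"x" * 1024)
#   print(a.lstat().st_nlink)   # 1 -> only one directory entry for the inode
#   os.link(a, d / "b.bin")     # create a second hardlink to the same inode
#   print(a.lstat().st_nlink)   # 2 -> two names now share the same data
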

def get_file_checksum(p: pathlib.Path) -> str:
    """Return the MD5 hex digest of a file, reading it in 8 KiB chunks so
    large files are never loaded into memory all at once."""
    m = hashlib.md5()
    with open(p, 'rb') as f:
        while chunk := f.read(8192):
            m.update(chunk)
    hexvalue = m.hexdigest()
    print(hexvalue)
    return hexvalue

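
# A minimal sketch of how the checksums could be used to group files with
# identical content (e.g. duplicates that are *not* hardlinked). The helper
# name is illustrative; nothing in the CLI below calls it.
def group_files_by_checksum(paths: list) -> dict:
    groups = {}
    for p in paths:
        if p.is_file():  # skip directories returned by find_all_files
            groups.setdefault(get_file_checksum(p), []).append(p)
    return groups
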

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='convert hardlinks to symlinks in the download folder')

    parser.add_argument('-s', '--source',
                        type=str,
                        help='path to the directory containing the source files')

    parser.add_argument('-t', '--target',
                        type=str,
                        help='path to the directory to move files to')

    parser.add_argument('--size',
                        type=int,
                        help='minimum file size to consider, in MB (default: 50)')

    parser.add_argument('--csv',
                        type=str,
                        choices=['False', 'True'],
                        help="export the source folder's unique files to CSV")

    parser.add_argument('--order',
                        type=str,
                        choices=['inode', 'name', 'checksum'],
                        help='sort the exported CSV by inode, name, or checksum')

    parser.add_argument('--csv_path',
                        type=str,
                        help='path of the CSV file to write (default: result.csv)')

    parser.add_argument('--move',
                        type=str,
                        help='confirm whether to move all files with NumberOfLinks of 1 '
                             'from the source directory to the target directory')

    args = parser.parse_args()

    min_file_size = 50  # MB; can be overridden with --size
    path_library = args.source
    path_inventory = args.target

    if args.size:
        min_file_size = args.size

    paths = find_all_files(pathlib.Path(path_library))

    if args.csv == "True":
        csv_path = "result.csv"
        rows = []
        for x in paths:
            if x.lstat().st_size > min_file_size * 1024 * 1024:
                print(str(datetime.now()) + " : " + str(x))
                rows.append({'FileName': x.name,
                             'Path': str(x),
                             'inode': x.lstat().st_ino,
                             'NumberOfLinks': x.lstat().st_nlink,
                             'CheckSum': get_file_checksum(x)})
        # DataFrame.append was removed in pandas 2.0, so the rows are collected
        # in a list above and turned into a DataFrame in one step here.
        df_csv = pd.DataFrame(rows, columns=['FileName', 'Path', 'inode',
                                             'NumberOfLinks', 'CheckSum'])

        if args.order == "inode":
            df_csv = df_csv.sort_values(by="inode")
        elif args.order == "name":
            df_csv = df_csv.sort_values(by="FileName")
        elif args.order == "checksum":
            df_csv = df_csv.sort_values(by="CheckSum")

        if args.csv_path:
            csv_path = args.csv_path

        df_csv.to_csv(csv_path, index_label='Index')
    else:
        for x in paths:
            print(str(x))
            view_unique_files(x, min_file_size)

            # Move scaffolding (not yet enabled): relink each single-link file
            # under the --target directory.
            # dst_path = pathlib.Path(os.path.join(path_inventory, x.name))
            # print("Its new path: " + str(dst_path))
            # x.link_to(dst_path)  # Path.link_to was removed in Python 3.12;
            #                      # dst_path.hardlink_to(x) is the replacement
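
# Example invocations (the filename `hardlinks.py` is an assumption; substitute
# whatever this script is saved as):
#
#   python hardlinks.py -s /volume1/downloads --size 100
#   python hardlinks.py -s /volume1/downloads --csv True --order checksum --csv_path result.csv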
|