import argparse
import csv
import hashlib
import os
import pathlib
from datetime import datetime
from typing import List

import pandas as pd
from pandas.core.series import Series

from media import Media

# Default minimum file size (MiB) a file must exceed to be inventoried/moved.
DEFAULT_MIN_SIZE_MB = 50
# Chunk size for checksumming; bounds memory use on large files.
_CHECKSUM_CHUNK = 8192


def find_all_files(directory: pathlib.Path) -> List[pathlib.Path]:
    """Recursively collect every path under *directory*.

    Paths containing 'eaDir' (Synology '@eaDir' metadata directories) are
    skipped. Returns files and directories alike, as ``rglob('*')`` yields both.
    """
    return [p for p in directory.rglob('*') if "eaDir" not in str(p)]


def view_unique_files(p: pathlib.Path, min_size: int) -> None:
    """Print hard-link info for *p* when it is larger than *min_size* MiB.

    Uses ``lstat`` so symlinks are not followed. Purely informational:
    one message for link-count 1, another for link-count > 1.
    """
    st = p.lstat()  # single stat call instead of four
    if st.st_size > min_size * 1024 * 1024:
        if st.st_nlink == 1:
            print("File with only one hardlink:" + str(p) + "; size: "
                  + str(st.st_size / 1024 / 1024)
                  + "MB with number of links: " + str(st.st_nlink))
        else:
            print("File with more than one hardlink:" + str(p) + "; size: "
                  + str(st.st_size / 1024 / 1024)
                  + "MB with number of links: " + str(st.st_nlink))


# Backward-compatible alias for the original (misspelled) public name.
view_unqiue_files = view_unique_files


def get_file_checksum(p: pathlib.Path) -> str:
    """Return the MD5 hex digest of file *p*, reading it in 8 KiB chunks.

    Also prints the digest (kept for parity with the original script output).
    MD5 is used for de-duplication identity here, not security.
    """
    digest = hashlib.md5()
    with open(p, 'rb') as f:
        while chunk := f.read(_CHECKSUM_CHUNK):
            digest.update(chunk)
    hexvalue = digest.hexdigest()
    print(hexvalue)
    return hexvalue


def _parse_args() -> argparse.Namespace:
    """Build and parse the CLI. Flags/choices are unchanged from the original."""
    parser = argparse.ArgumentParser(
        description='convert hardlink to symlink in download folder')
    parser.add_argument('-s', '--source', type=str,
                        help='the path to directory with source files')
    parser.add_argument('-t', '--target', type=str,
                        help='the path to directory to move files')
    parser.add_argument('--size', type=int, help='Expected minimum file size')
    parser.add_argument('--csv', type=str, choices=['False', 'True'],
                        help='export source folders unique files into csv')
    parser.add_argument('--order', type=str,
                        help='order exported csv by name or inode',
                        choices=['inode', 'name', 'checksum'])
    parser.add_argument('--csv_path', type=str, help='path to export csv')
    parser.add_argument('--move', type=str, choices=['False', 'True'],
                        help='Confirm whether to move all files with '
                             'NumberOfLinks as 1 from source directory to '
                             'target directory')
    return parser.parse_args()


def _export_csv(paths: List[pathlib.Path], min_bytes: int,
                order: str, csv_path: str) -> None:
    """Inventory files larger than *min_bytes* and write them to *csv_path*.

    Rows are accumulated in a plain list and the DataFrame is built once:
    ``DataFrame.append`` was removed in pandas 2.0 and was O(n^2) anyway.
    """
    rows = []
    for x in paths:
        st = x.lstat()
        if st.st_size > min_bytes:
            print(str(datetime.now()) + " : " + str(x))
            rows.append({'FileName': x.name,
                         'Path': str(x),
                         'inode': st.st_ino,
                         'NumberOfLinks': st.st_nlink,
                         'CheckSum': get_file_checksum(x)})
    df_csv = pd.DataFrame(
        rows, columns=['FileName', 'Path', 'inode', 'NumberOfLinks', 'CheckSum'])
    # Map the CLI --order choice to the DataFrame column it sorts on.
    sort_column = {'inode': 'inode',
                   'name': 'FileName',
                   'checksum': 'CheckSum'}.get(order)
    if sort_column is not None:
        df_csv = df_csv.sort_values(by=sort_column)
    df_csv.to_csv(csv_path, index_label='Index')


def _move_large_files(paths: List[pathlib.Path], min_bytes: int,
                      target: str) -> None:
    """Move every file larger than *min_bytes* into *target* (flat, by name)."""
    target_dir = pathlib.Path(target)
    for x in paths:
        if x.lstat().st_size > min_bytes:
            print(str(datetime.now()) + " : " + str(x))
            new_path = pathlib.Path(target_dir, x.name).resolve()
            print("New path: " + str(new_path))
            # NOTE(review): Path.rename raises OSError across filesystems —
            # confirm source and target share a mount, or use shutil.move.
            x.rename(new_path)


if __name__ == "__main__":
    args = _parse_args()
    # --size of 0 (falsy) keeps the default, matching the original check.
    min_file_size = args.size if args.size else DEFAULT_MIN_SIZE_MB
    min_bytes = min_file_size * 1024 * 1024
    paths_library = find_all_files(pathlib.Path(args.source))
    if args.csv == "True":
        # Empty/None --csv_path falls back to result.csv, as before.
        _export_csv(paths_library, min_bytes, args.order,
                    args.csv_path if args.csv_path else "result.csv")
    else:
        for x in paths_library:
            print(str(x))
            view_unique_files(x, min_file_size)
    if args.move == "True":
        _move_large_files(paths_library, min_bytes, args.target)