diff --git a/mv_unique_files.py b/mv_unique_files.py
index a401704..87ab78f 100644
--- a/mv_unique_files.py
+++ b/mv_unique_files.py
@@ -35,9 +35,43 @@ def get_file_checksum(p: pathlib.Path):
         while chunk := f.read(8192):
             m.update(chunk)
     hexvalue = m.hexdigest()
-    print(hexvalue)
     return hexvalue
 
+
+def get_file_dataframe(ps, min_file_size, bool_checksum: bool) -> pd.DataFrame:
+    """Describe every file in *ps* that is larger than the size threshold.
+
+    Args:
+        ps: Iterable of path objects exposing ``lstat()`` and ``name``
+            (e.g. ``pathlib.Path``).
+        min_file_size: Threshold in MiB; only strictly larger files are listed.
+        bool_checksum: If true, add a ``CheckSum`` column filled by
+            ``get_file_checksum`` (which reads each listed file completely).
+
+    Returns:
+        DataFrame with columns FileName, Path, inode, NumberOfLinks and,
+        when requested, CheckSum; one row per file above the threshold.
+    """
+    columns = ['FileName', 'Path', 'inode', 'NumberOfLinks']
+    if bool_checksum:
+        columns.append('CheckSum')
+    min_bytes = min_file_size * 1024 * 1024
+    rows = []
+    for p in ps:
+        st = p.lstat()  # stat once per file instead of three times
+        if st.st_size <= min_bytes:
+            continue
+        row = {'FileName': p.name,
+               'Path': str(p),
+               'inode': st.st_ino,
+               'NumberOfLinks': st.st_nlink}
+        if bool_checksum:
+            row['CheckSum'] = get_file_checksum(p)
+        rows.append(row)
+    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
+    # every call, so collect plain dicts and build the frame once.
+    return pd.DataFrame(rows, columns=columns)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='convert hardlink to symlink in download folder')
@@ -72,27 +106,31 @@ if __name__ == "__main__":
                         help='Confirm whether to move all files with NumberOfLinks as 1 from source directory to target directory',
                         choices=['False','True'])
 
+    parser.add_argument('--hardlink',
+                        type=str,
+                        help='Whether to copy files in source directory (via hardlink) to target directory',
+                        choices=['True','False'])
+
+    parser.add_argument('--unique',
+                        type=str,
+                        help='Whether the copy file is unique in target directory',
+                        choices=['True','False'])
+
     args = parser.parse_args()
 
     min_file_size = 50
-    path_library = args.source
-    path_inventory = args.target
+    path_source = args.source
+    path_target = args.target
     if args.size:
         min_file_size = args.size
-    paths_library = find_all_files(pathlib.Path(path_library))
+    if not path_source:
+        parser.error('a source directory is required')
+    paths_source = find_all_files(pathlib.Path(path_source))
 
     if args.csv == "True":
+        # Export a CSV listing of the source directory to csv_path; rows can be ordered by name, inode, or checksum
         csv_path = "result.csv"
-        df_csv = pd.DataFrame(columns=['FileName', 'Path', 'inode', 'NumberOfLinks', 'CheckSum'])
-        for x in paths_library:
-            if x.lstat().st_size > min_file_size * 1024 * 1024:
-                print(str(datetime.now()) + " : " + str(x))
-                new_row = {'FileName': x.name,
-                           'Path': str(x),
-                           'inode': x.lstat().st_ino,
-                           'NumberOfLinks': x.lstat().st_nlink,
-                           'CheckSum': get_file_checksum(x)}
-                df_csv = df_csv.append(new_row, ignore_index=True)
+        df_csv = get_file_dataframe(paths_source, min_file_size, True)
         if args.order == "inode":
             df_csv = df_csv.sort_values(by="inode")
         elif args.order == "name":
@@ -103,16 +141,42 @@ if __name__ == "__main__":
         if args.csv_path:
             csv_path = args.csv_path
         df_csv.to_csv(csv_path,index_label='Index')
-    else:
-        for x in paths_library:
-            print(str(x))
-            view_unqiue_files(x, min_file_size)
 
     if args.move == "True":
-        target_dir = pathlib.Path(path_inventory)
-        for x in paths_library:
+        if not path_target:
+            parser.error('a target directory is required for --move')
+        target_dir = pathlib.Path(path_target)
+        for x in paths_source:
             if x.lstat().st_size > min_file_size * 1024 * 1024:
                 print(str(datetime.now()) + " : " + str(x))
                 new_path = pathlib.Path(target_dir, x.name).resolve()
                 print("New path: " + str(new_path))
-                x.rename(new_path)
\ No newline at end of file
+                x.rename(new_path)
+
+    if args.hardlink == 'True':
+        import os  # local import: the file's import block is outside this patch
+
+        if not path_target:
+            parser.error('a target directory is required for --hardlink')
+        target_dir = pathlib.Path(path_target)
+        paths_target = find_all_files(target_dir)
+
+        df_library = get_file_dataframe(paths_source, min_file_size, False)
+        df_target = get_file_dataframe(paths_target, min_file_size, False)
+        # Build the inode lookup once instead of a fresh list for every row.
+        target_inodes = set(df_target.inode)
+        for row in df_library.itertuples(index=False):
+            print(row.Path)
+            if row.inode in target_inodes:
+                print("Is in target directory")
+                continue
+            p = pathlib.Path(row.Path)
+            target_path = pathlib.Path(target_dir, row.FileName)
+            try:
+                # Path.link_to() was removed in Python 3.12; os.link(src, dst)
+                # creates dst as a hard link to the existing file src.
+                os.link(p, target_path)
+            except FileExistsError:
+                print("Skip " + str(p) + ": " + str(target_path) + " already exists")
+                continue
+            print("Create hardlink of " + str(p) + " at location " + str(target_path))