# Reconstructed from the patch "Created deal_duplicate_files.py" (commit 5ed4d99f).
"""Report files in a source listing whose checksum also appears in a target listing.

Both listings are CSV files with an ``Index`` column (used as the row index)
and at least the columns ``CheckSum``, ``Path``, ``NumberOfLinks`` and
``inode``.  The script prints the matching source rows, both listings, and
whether each matching source path currently exists on disk.
"""
import argparse
import pathlib
from typing import Optional, Sequence

import pandas as pd


def load_listing(csv_path: str) -> pd.DataFrame:
    """Read one listing CSV, indexed by its ``Index`` column.

    ``NumberOfLinks`` and ``inode`` are parsed as integers, as in the
    original script.
    """
    return pd.read_csv(
        csv_path,
        index_col='Index',
        dtype={'NumberOfLinks': 'int', 'inode': 'int'},
    )


def find_duplicates(df_source: pd.DataFrame, df_target: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *df_source* whose ``CheckSum`` occurs in *df_target*.

    Each matching source row is returned exactly once, in source order, with
    its original index and columns.  When nothing matches, the result is an
    empty frame that still has the source columns, so ``result.Path`` is
    always valid.
    """
    # Drop missing checksums from the target so a missing source checksum
    # never counts as a match (NaN == NaN is False under plain equality,
    # but ``isin`` would otherwise treat NaN as matching NaN).
    target_checksums = df_target['CheckSum'].dropna()
    return df_source[df_source['CheckSum'].isin(target_checksums)]


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Parse the command line, compare the two listings and print the report.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Deal with duplicate files in synology')
    # Both listings are mandatory: without them read_csv would be handed None.
    parser.add_argument('-s', '--source',
                        type=str,
                        required=True,
                        help='csv file to the directory to be de-duplicate')
    parser.add_argument('-t', '--target',
                        type=str,
                        required=True,
                        help='csv file to the directory that be compared')
    args = parser.parse_args(argv)

    df_source = load_listing(args.source)
    df_target = load_listing(args.target)
    df_filtered = find_duplicates(df_source, df_target)

    print("Found Duplicate")
    print(df_filtered)
    print("Source Directory")
    print(df_source)
    print("Target")
    print(df_target)

    # Show whether each duplicate source path is still present on this machine.
    for raw_path in df_filtered.Path:
        p = pathlib.Path(raw_path)
        print(f"{p}: {p.exists()}")


if __name__ == "__main__":
    main()