synology_link_manipultor/deal_duplicate_files.py

52 lines
1.7 KiB
Python
Raw Normal View History

2021-08-15 17:12:10 +10:00
import pathlib
2021-08-15 19:33:40 +10:00
import os
2021-08-15 17:12:10 +10:00
import argparse
from numpy import source
import pandas as pd
def find_duplicates(df_source: pd.DataFrame, df_target: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *df_source* whose ``CheckSum`` also occurs in *df_target*.

    Both frames must have a ``CheckSum`` column. The result keeps the columns
    and index of *df_source*, so it still has a ``Path`` column even when no
    row matches. Each source row appears at most once, no matter how many
    target rows share its checksum.
    """
    target_checksums = df_target["CheckSum"].dropna()
    source_checksums = df_source["CheckSum"]
    # ``isin`` treats NaN as equal to NaN; exclude missing checksums explicitly
    # so files without a checksum are never reported (and removed) as duplicates.
    mask = source_checksums.notna() & source_checksums.isin(target_checksums)
    return df_source[mask]


def main(argv=None) -> None:
    """Report, and optionally delete, source files duplicated in the target.

    Reads two CSV listings (indexed by an ``Index`` column, with ``CheckSum``
    and ``Path`` columns), prints the source rows whose checksum also appears
    in the target listing, and removes those files from disk when
    ``--remove True`` is given.

    Args:
        argv: Command-line arguments without the program name. ``None`` makes
            argparse read ``sys.argv[1:]``.
    """
    # NOTE(review): the file-level ``from numpy import source`` is not used by
    # this script; confirm it can be dropped.
    parser = argparse.ArgumentParser(description='Deal with duplicate files in synology')
    parser.add_argument('-s', '--source',
                        type=str,
                        required=True,
                        help='csv file to the directory to be de-duplicate')
    parser.add_argument('-t', '--target',
                        type=str,
                        required=True,
                        help='csv file to the directory that be compared')
    # Kept as a 'True'/'False' string choice so existing invocations still work.
    parser.add_argument('--remove',
                        type=str,
                        help='remove duplicated file or not',
                        choices=['True', 'False'])
    args = parser.parse_args(argv)

    column_types = {'NumberOfLinks': 'int', 'inode': 'int'}
    df_source = pd.read_csv(args.source, index_col='Index', dtype=column_types)
    df_target = pd.read_csv(args.target, index_col='Index', dtype=column_types)

    df_filtered = find_duplicates(df_source, df_target)

    print("Found Duplicate")
    print(df_filtered)
    print("Source Directory")
    print(df_source)
    print("Target")
    print(df_target)

    remove_requested = args.remove == "True"
    for raw_path in df_filtered["Path"]:
        p = pathlib.Path(raw_path)
        exists = p.exists()
        print(str(p) + ": " + str(exists))
        if remove_requested and exists:
            os.remove(p)
            print(str(p) + " Removed")


if __name__ == "__main__":
    main()