synology_link_manipultor/mv_unique_files.py

159 lines
5.9 KiB
Python

import pathlib
import os
import argparse
from typing import List
import pandas as pd
import hashlib
from datetime import datetime
import csv
from pandas.core.series import Series
from media import Media
def find_all_files(directory: pathlib.Path) -> list:
    """Recursively collect every entry below ``directory``.

    Both files and sub-directories are returned, in the order produced by
    ``Path.rglob('*')``. Any entry whose full path string contains
    ``"eaDir"`` is left out.

    Args:
        directory: Root directory to walk.

    Returns:
        A list of ``pathlib.Path`` objects for the retained entries.
    """
    return [
        entry
        for entry in directory.rglob('*')
        if "eaDir" not in str(entry)
    ]
def view_unqiue_files(p: pathlib.Path, min_size):
    """Print a one-line hard-link report for ``p`` when it is large enough.

    Nothing is printed unless the size reported by ``lstat`` is strictly
    greater than ``min_size`` mebibytes. Otherwise the line states whether
    the entry has exactly one hard link or not, its size in MB and its
    link count.

    Args:
        p: Path to inspect (symlinks are not followed).
        min_size: Size threshold in MiB.
    """
    info = p.lstat()
    if not info.st_size > min_size * 1024 * 1024:
        return

    if info.st_nlink == 1:
        headline = "File with only one hardlink:"
    else:
        headline = "File with more than one hardlink:"

    size_mb = info.st_size / 1024 / 1024
    print(f"{headline}{p}; size: {size_mb}MB with number of links: {info.st_nlink}")
def get_file_checksum(p: pathlib.Path):
    """Return the hexadecimal MD5 digest of the file at ``p``.

    The file is read in 8 KiB blocks so large files are never loaded into
    memory in one piece.

    Args:
        p: Path of the file to hash.

    Returns:
        The MD5 digest as a lowercase hex string.
    """
    digest = hashlib.md5()
    with open(p, 'rb') as stream:
        # iter() with a sentinel stops as soon as read() returns b'' (EOF).
        for block in iter(lambda: stream.read(8192), b''):
            digest.update(block)
    return digest.hexdigest()
def get_file_dataframe(ps, min_file_size, bool_checksum: bool) -> pd.DataFrame:
    """Build a table describing every path in ``ps`` larger than a threshold.

    Args:
        ps: Iterable of ``pathlib.Path`` objects to inspect.
        min_file_size: Size threshold in MiB; only entries whose ``lstat``
            size is strictly greater are included.
        bool_checksum: When true, an extra ``CheckSum`` column is filled
            with the result of ``get_file_checksum`` for each entry.

    Returns:
        A DataFrame with columns ``FileName``, ``Path``, ``inode`` and
        ``NumberOfLinks`` (plus ``CheckSum`` when requested), one row per
        retained path. The columns are present even when no path qualifies.
    """
    columns = ['FileName', 'Path', 'inode', 'NumberOfLinks']
    if bool_checksum:
        columns.append('CheckSum')
    min_bytes = min_file_size * 1024 * 1024

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for p in ps:
        info = p.lstat()
        if info.st_size > min_bytes:
            row = {'FileName': p.name,
                   'Path': str(p),
                   'inode': info.st_ino,
                   'NumberOfLinks': info.st_nlink}
            if bool_checksum:
                row['CheckSum'] = get_file_checksum(p)
            rows.append(row)
    return pd.DataFrame(rows, columns=columns)
def main() -> None:
    """Command-line entry point.

    Scans ``--source`` recursively and then, depending on the flags given:

    * ``--csv True``: writes a CSV (with checksums) of every entry larger
      than the size threshold, optionally sorted via ``--order``.
    * ``--move True``: renames every entry larger than the threshold into
      ``--target`` (flat, keeping only the file name).
    * ``--hardlink True``: hard-links into ``--target`` every large source
      entry whose inode is not already present under ``--target``.

    Exits with a usage error when ``--source`` is missing, or when
    ``--target`` is missing but required by ``--move`` / ``--hardlink``.
    """
    parser = argparse.ArgumentParser(description='convert hardlink to symlink in download folder')
    parser.add_argument('-s', '--source',
                        type=str,
                        help='the path to directory with source files')
    parser.add_argument('-t', '--target',
                        type=str,
                        help='the path to directory to move files')
    parser.add_argument('--size',
                        type=int,
                        help='Expected minimum file size')
    parser.add_argument('--csv',
                        type=str,
                        choices=['False', 'True'],
                        help='export source folders unique files into csv')
    parser.add_argument('--order',
                        type=str,
                        help='order exported csv by name or inode',
                        choices=['inode', 'name', 'checksum'])
    parser.add_argument('--csv_path',
                        type=str,
                        help='path to export csv')
    parser.add_argument('--move',
                        type=str,
                        help='Confirm whether to move all files with NumberOfLinks as 1 from source directory to target directory',
                        choices=['False', 'True'])
    parser.add_argument('--hardlink',
                        type=str,
                        help='Whether copy files in source directory (via hardlink) to target directory',
                        choices=['True', 'False'])
    parser.add_argument('--unique',
                        type=str,
                        help='Wether the copy file is unqiue in target directory',
                        choices=['True', 'False'])
    args = parser.parse_args()

    # Report missing paths as usage errors; previously these surfaced as a
    # NameError on an unbound variable further down.
    if not args.source:
        parser.error("--source is required")
    needs_target = args.move == "True" or args.hardlink == "True"
    if needs_target and not args.target:
        parser.error("--target is required with --move True or --hardlink True")

    # NOTE(review): an explicit ``--size 0`` is falsy and therefore falls
    # back to the 50 MiB default, exactly as before.
    min_file_size = args.size if args.size else 50
    min_bytes = min_file_size * 1024 * 1024

    paths_source = find_all_files(pathlib.Path(args.source))

    if args.csv == "True":
        # Export csv of source directory to csv_path, can be ordered by
        # name, inode, or checksum.
        df_csv = get_file_dataframe(paths_source, min_file_size, True)
        sort_column = {"inode": "inode",
                       "name": "FileName",
                       "checksum": "CheckSum"}.get(args.order)
        if sort_column is not None:
            df_csv = df_csv.sort_values(by=sort_column)
        df_csv.to_csv(args.csv_path or "result.csv", index_label='Index')

    if args.move == "True":
        target_dir = pathlib.Path(args.target)
        # NOTE(review): the --move help text promises to move only files
        # with NumberOfLinks == 1, but no st_nlink check is made here, so
        # every large entry is moved. Behavior kept as-is; confirm intent.
        for x in paths_source:
            if x.lstat().st_size > min_bytes:
                print(str(datetime.now()) + " : " + str(x))
                new_path = pathlib.Path(target_dir, x.name).resolve()
                print("New path: " + str(new_path))
                x.rename(new_path)

    if args.hardlink == 'True':
        target_dir = pathlib.Path(args.target)
        paths_target = find_all_files(target_dir)
        df_library = get_file_dataframe(paths_source, min_file_size, False)
        df_target = get_file_dataframe(paths_target, min_file_size, False)
        print(df_library)
        print(df_target)
        print(df_target.inode)
        # Snapshot the target inodes once so each membership test is O(1)
        # instead of rebuilding a list for every source row.
        target_inodes = set(df_target.inode.to_list())
        for _, row in df_library.iterrows():
            print(row.Path)
            if row.inode not in target_inodes:
                p = pathlib.Path(row.Path)
                target_path = pathlib.Path(target_dir, row.FileName)
                # os.link(src, dst) is what Path.link_to did; link_to was
                # deprecated in Python 3.10 and removed in 3.12.
                os.link(p, target_path)
                print("Create hardlink of " + str(p) + " at location " + str(target_path))
            else:
                print("Is in target directory")


if __name__ == "__main__":
    main()