# synology_link_manipultor/mv_unique_files.py
#
# Report, export, move, or hardlink-copy files from a source directory,
# based on their hard-link counts (Synology NAS housekeeping helper).
import argparse
import csv
import hashlib
import os
import pathlib
from datetime import datetime
from typing import List

import pandas as pd
from pandas.core.series import Series

from media import Media
def find_all_files(directory: pathlib.Path) -> List[pathlib.Path]:
    """Recursively collect every path under *directory*, skipping Synology
    metadata directories (any path containing "eaDir", e.g. "@eaDir").

    Args:
        directory: root directory to walk.

    Returns:
        All file and directory paths found, in ``rglob`` order.
    """
    # Substring test excludes both the "@eaDir" directories themselves
    # and everything nested inside them.
    return [p for p in directory.rglob('*') if "eaDir" not in str(p)]
def view_unqiue_files(p: pathlib.Path, min_size) -> None:
    """Print whether *p* has exactly one hard link or several.

    Files at or below the size threshold are silently skipped.

    Args:
        p: file path to inspect (``lstat`` is used, so symlinks are not
           followed).
        min_size: size threshold in MiB.
    """
    st = p.lstat()  # one syscall instead of one per attribute access
    if st.st_size <= min_size * 1024 * 1024:
        return
    # Message text matches the original output exactly.
    kind = "only one" if st.st_nlink == 1 else "more than one"
    print("File with " + kind + " hardlink:" + str(p) + "; size: " +
          str(st.st_size / 1024 / 1024) +
          "MB with number of links: " + str(st.st_nlink))
def get_file_checksum(p: pathlib.Path):
    """Return the hexadecimal MD5 digest of the file at *p*.

    The file is streamed in 8 KiB chunks so arbitrarily large files can
    be hashed without loading them into memory.
    """
    digest = hashlib.md5()
    with open(p, 'rb') as fh:
        # iter(callable, sentinel) yields chunks until read() returns b''.
        for chunk in iter(lambda: fh.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest()
2021-08-15 22:19:17 +10:00
def get_file_dataframe(ps, min_file_size, bool_checksum: bool) -> pd.DataFrame:
if bool_checksum == True:
df = pd.DataFrame(columns=['FileName', 'Path', 'inode', 'NumberOfLinks', 'CheckSum'])
for p in ps:
if p.lstat().st_size > min_file_size * 1024 * 1024:
new_row = {'FileName': p.name,
'Path': str(p),
'inode': p.lstat().st_ino,
'NumberOfLinks': p.lstat().st_nlink,
'CheckSum': get_file_checksum(p)}
df = df.append(new_row, ignore_index=True)
return df
else:
df = pd.DataFrame(columns=['FileName', 'Path', 'inode', 'NumberOfLinks'])
for p in ps:
if p.lstat().st_size > min_file_size * 1024 * 1024:
new_row = {'FileName': p.name,
'Path': str(p),
'inode': p.lstat().st_ino,
'NumberOfLinks': p.lstat().st_nlink}
df = df.append(new_row, ignore_index=True)
return df
2021-08-14 21:45:26 +10:00
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='convert hardlink to symlink in download folder')

    parser.add_argument('-s', '--source',
                        type=str,
                        help='the path to directory with source files')
    parser.add_argument('-t', '--target',
                        type=str,
                        help='the path to directory to move files')
    parser.add_argument('--size',
                        type=int,
                        help='Expected minimum file size')
    # 'True'/'False' string choices are kept (instead of store_true) for
    # backward compatibility with existing invocations.
    parser.add_argument('--csv',
                        type=str,
                        choices=['False', 'True'],
                        help='export source folders unique files into csv')
    parser.add_argument('--order',
                        type=str,
                        help='order exported csv by name or inode',
                        choices=['inode', 'name', 'checksum'])
    parser.add_argument('--csv_path',
                        type=str,
                        help='path to export csv')
    parser.add_argument('--move',
                        type=str,
                        help='Confirm whether to move all files with NumberOfLinks as 1 from source directory to target directory',
                        choices=['False', 'True'])
    parser.add_argument('--hardlink',
                        type=str,
                        help='Whether copy files in source directory (via hardlink) to target directory',
                        choices=['True', 'False'])
    parser.add_argument('--unique',
                        type=str,
                        help='Whether the copy file is unique in target directory',
                        choices=['True', 'False'])

    args = parser.parse_args()

    min_file_size = 50  # MiB default; overridden by --size

    # Fail fast with a usage message instead of crashing later with a
    # NameError when --source/--target were omitted (original bug: the
    # variables were only bound inside `if args.source:` guards).
    if not args.source:
        parser.error('--source is required')
    path_source = args.source
    path_target = args.target

    if args.size:
        min_file_size = args.size

    paths_source = find_all_files(pathlib.Path(path_source))

    if args.csv == "True":
        # Export csv of source directory to csv_path; can be ordered by
        # name, inode, or checksum.
        csv_path = "result.csv"
        df_csv = get_file_dataframe(paths_source, min_file_size, True)
        if args.order == "inode":
            df_csv = df_csv.sort_values(by="inode")
        elif args.order == "name":
            df_csv = df_csv.sort_values(by="FileName")
        elif args.order == "checksum":
            df_csv = df_csv.sort_values(by="CheckSum")
        if args.csv_path:
            csv_path = args.csv_path
        df_csv.to_csv(csv_path, index_label='Index')

    if (args.move == "True" or args.hardlink == 'True') and not path_target:
        parser.error('--target is required with --move/--hardlink')

    if args.move == "True":
        target_dir = pathlib.Path(path_target)
        for x in paths_source:
            if x.lstat().st_size > min_file_size * 1024 * 1024:
                print(str(datetime.now()) + " : " + str(x))
                new_path = pathlib.Path(target_dir, x.name).resolve()
                print("New path: " + str(new_path))
                # NOTE(review): rename() fails across filesystems; this
                # assumes source and target share a volume — confirm.
                x.rename(new_path)

    if args.hardlink == 'True':
        target_dir = pathlib.Path(path_target)
        paths_target = find_all_files(target_dir)
        df_library = get_file_dataframe(paths_source, min_file_size, False)
        df_target = get_file_dataframe(paths_target, min_file_size, False)
        print(df_library)
        print(df_target)
        print(df_target.inode)
        # Build the inode set once: O(1) membership per row instead of
        # scanning the list on every iteration.
        target_inodes = set(df_target.inode.to_list())
        for index, row in df_library.iterrows():
            print(row.Path)
            if row.inode not in target_inodes:
                p = pathlib.Path(row.Path)
                target_path = pathlib.Path(target_dir, row.FileName)
                # Path.link_to was deprecated in 3.10 and removed in 3.12
                # (its argument order was also confusing); os.link(src, dst)
                # is the portable equivalent.
                os.link(p, target_path)
                print("Create hardlink of " + str(p) + " at location " + str(target_path))
            else:
                print("Is in target directory")