diff --git a/mv_unique_files.py b/mv_unique_files.py
index 5fe4174..636b78f 100644
--- a/mv_unique_files.py
+++ b/mv_unique_files.py
@@ -3,6 +3,7 @@ import os
 import argparse
 from typing import List
 import pandas as pd
+import hashlib
 import csv
 from pandas.core.series import Series
 
@@ -27,6 +28,13 @@ def view_unqiue_files(p: pathlib.Path, min_size):
               str(p.lstat().st_size/1024/1024) + "MB with number of links: " + str(p.lstat().st_nlink))
 
 
+def get_file_checksum(p: pathlib.Path):
+    m = hashlib.md5()
+    with open(p, 'rb') as f:
+        while chunk := f.read(8192):
+            m.update(chunk)
+    return m.hexdigest()
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='convert hardlink to symlink in download folder')
 
@@ -50,7 +58,7 @@ if __name__ == "__main__":
     parser.add_argument('--order',
                         type=str,
                         help='order exported csv by name or inode',
-                        choices=['inode','name'])
+                        choices=['inode','name','checksum'])
 
     parser.add_argument('--csv_path',
                         type=str,
@@ -71,18 +79,21 @@ if __name__ == "__main__":
 
     if args.csv == "True":
         csv_path = "result.csv"
-        df_csv = pd.DataFrame(columns=['FileName', 'Path', 'inode', 'NumberOfLinks'])
+        df_csv = pd.DataFrame(columns=['FileName', 'Path', 'inode', 'NumberOfLinks', 'CheckSum'])
         for x in paths:
             if x.lstat().st_size > min_file_size * 1024 * 1024:
                 new_row = {'FileName': x.name,
                            'Path': str(x),
                            'inode': x.lstat().st_ino,
-                           'NumberOfLinks': x.lstat().st_nlink}
+                           'NumberOfLinks': x.lstat().st_nlink,
+                           'CheckSum': get_file_checksum(x)}
                 df_csv = df_csv.append(new_row, ignore_index=True)
         if args.order == "inode":
             df_csv = df_csv.sort_values(by="inode")
         elif args.order == "name":
             df_csv = df_csv.sort_values(by="FileName")
+        elif args.order == "checksum":
+            df_csv = df_csv.sort_values(by="CheckSum")
 
         if args.csv_path:
             csv_path = args.csv_path