"""Inspect hardlink counts of large files under a source directory and,
optionally, export an inventory (name, path, inode, link count, MD5 checksum)
to CSV."""
import argparse
import hashlib
import os  # used only by the commented-out move logic at the bottom
import pathlib
from datetime import datetime

import pandas as pd

def find_all_files(directory: pathlib.Path) -> list:
    """Recursively collect every path under `directory`, skipping anything
    whose path contains "eaDir" (e.g. Synology @eaDir metadata folders)."""
    ps = []
    for p in directory.rglob('*'):
        if "eaDir" not in str(p):
            ps.append(p)
    return ps


def view_unique_files(p: pathlib.Path, min_size):
    """Print hardlink information for files larger than `min_size` MB."""
    stat = p.lstat()
    if stat.st_size > min_size * 1024 * 1024:
        if stat.st_nlink == 1:
            print(f"File with only one hardlink: {p}; size: "
                  f"{stat.st_size / 1024 / 1024} MB with number of links: {stat.st_nlink}")
        else:
            print(f"File with more than one hardlink: {p}; size: "
                  f"{stat.st_size / 1024 / 1024} MB with number of links: {stat.st_nlink}")

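
# A minimal, self-contained sketch of the st_nlink bookkeeping that
# view_unique_files relies on (paths here are illustrative); kept as a comment
# so importing or running this script is unaffected:
#
#   import os, pathlib, tempfile
#   d = pathlib.Path(tempfile.mkdtemp())
#   a = d / "a.bin"
#   a.write_bytes(b"x" * 1024)
#   print(a.lstat().st_nlink)   # 1 -> only one directory entry for the inode
#   os.link(a, d / "b.bin")     # create a second hardlink to the same inode
#   print(a.lstat().st_nlink)   # 2 -> two names now share the same data
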

def get_file_checksum(p: pathlib.Path) -> str:
    """Return the MD5 hex digest of a file, reading it in 8 KiB chunks so
    large files are never loaded into memory all at once."""
    m = hashlib.md5()
    with open(p, 'rb') as f:
        while chunk := f.read(8192):
            m.update(chunk)
    hexvalue = m.hexdigest()
    print(hexvalue)
    return hexvalue

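
# A minimal sketch of how the checksums could be used to group files with
# identical content (e.g. duplicates that are *not* hardlinked). The helper
# name is illustrative; nothing in the CLI below calls it.
def group_files_by_checksum(paths: list) -> dict:
    groups = {}
    for p in paths:
        if p.is_file():  # skip directories returned by find_all_files
            groups.setdefault(get_file_checksum(p), []).append(p)
    return groups
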

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='convert hardlinks to symlinks in the download folder')

    parser.add_argument('-s', '--source',
                        type=str,
                        help='path to the directory containing the source files')

    parser.add_argument('-t', '--target',
                        type=str,
                        help='path to the directory to move files to')

    parser.add_argument('--size',
                        type=int,
                        help='minimum file size to consider, in MB (default: 50)')

    parser.add_argument('--csv',
                        type=str,
                        choices=['False', 'True'],
                        help="export the source folder's unique files to CSV")

    parser.add_argument('--order',
                        type=str,
                        choices=['inode', 'name', 'checksum'],
                        help='sort the exported CSV by inode, name, or checksum')

    parser.add_argument('--csv_path',
                        type=str,
                        help='path of the CSV file to write (default: result.csv)')

    parser.add_argument('--move',
                        type=str,
                        help='confirm whether to move all files with NumberOfLinks of 1 '
                             'from the source directory to the target directory')

    args = parser.parse_args()

    min_file_size = 50  # MB; can be overridden with --size
    path_library = args.source
    path_inventory = args.target

    if args.size:
        min_file_size = args.size

    paths = find_all_files(pathlib.Path(path_library))

    if args.csv == "True":
        csv_path = "result.csv"
        rows = []
        for x in paths:
            if x.lstat().st_size > min_file_size * 1024 * 1024:
                print(str(datetime.now()) + " : " + str(x))
                rows.append({'FileName': x.name,
                             'Path': str(x),
                             'inode': x.lstat().st_ino,
                             'NumberOfLinks': x.lstat().st_nlink,
                             'CheckSum': get_file_checksum(x)})
        # DataFrame.append was removed in pandas 2.0, so the rows are collected
        # in a list above and turned into a DataFrame in one step here.
        df_csv = pd.DataFrame(rows, columns=['FileName', 'Path', 'inode',
                                             'NumberOfLinks', 'CheckSum'])

        if args.order == "inode":
            df_csv = df_csv.sort_values(by="inode")
        elif args.order == "name":
            df_csv = df_csv.sort_values(by="FileName")
        elif args.order == "checksum":
            df_csv = df_csv.sort_values(by="CheckSum")

        if args.csv_path:
            csv_path = args.csv_path

        df_csv.to_csv(csv_path, index_label='Index')
    else:
        for x in paths:
            print(str(x))
            view_unique_files(x, min_file_size)

            # Move scaffolding (not yet enabled): relink each single-link file
            # under the --target directory.
            # dst_path = pathlib.Path(os.path.join(path_inventory, x.name))
            # print("Its new path: " + str(dst_path))
            # x.link_to(dst_path)  # Path.link_to was removed in Python 3.12;
            #                      # dst_path.hardlink_to(x) is the replacement
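
# Example invocations (the filename `hardlinks.py` is an assumption; substitute
# whatever this script is saved as):
#
#   python hardlinks.py -s /volume1/downloads --size 100
#   python hardlinks.py -s /volume1/downloads --csv True --order checksum --csv_path result.csv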
|