# synology_link_manipultor/mv_unique_files.py
#
# Report, export, move, or hardlink-copy files from a source directory,
# based on their hard-link counts (Synology NAS housekeeping helper).
import argparse
import csv
import hashlib
import os
import pathlib
from datetime import datetime
from typing import List

import pandas as pd
from pandas.core.series import Series

from media import Media
def find_all_files(directory: pathlib.Path) -> List[pathlib.Path]:
    """Recursively collect every path under *directory*, skipping Synology
    metadata directories (any path containing "eaDir", e.g. "@eaDir").

    Args:
        directory: root directory to walk.

    Returns:
        All file and directory paths found, in ``rglob`` order.
    """
    # Substring test excludes both the "@eaDir" directories themselves
    # and everything nested inside them.
    return [p for p in directory.rglob('*') if "eaDir" not in str(p)]
def view_unqiue_files(p: pathlib.Path, min_size) -> None:
    """Print whether *p* has exactly one hard link or several.

    Files at or below the size threshold are silently skipped.

    Args:
        p: file path to inspect (``lstat`` is used, so symlinks are not
           followed).
        min_size: size threshold in MiB.
    """
    st = p.lstat()  # one syscall instead of one per attribute access
    if st.st_size <= min_size * 1024 * 1024:
        return
    # Message text matches the original output exactly.
    kind = "only one" if st.st_nlink == 1 else "more than one"
    print("File with " + kind + " hardlink:" + str(p) + "; size: " +
          str(st.st_size / 1024 / 1024) +
          "MB with number of links: " + str(st.st_nlink))
def get_file_checksum(p: pathlib.Path):
    """Return the hexadecimal MD5 digest of the file at *p*.

    The file is streamed in 8 KiB chunks so arbitrarily large files can
    be hashed without loading them into memory.
    """
    digest = hashlib.md5()
    with open(p, 'rb') as fh:
        # iter(callable, sentinel) yields chunks until read() returns b''.
        for chunk in iter(lambda: fh.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest()
2021-08-15 22:19:17 +10:00
def get_file_dataframe(ps, min_file_size, bool_checksum: bool) -> pd.DataFrame:
if bool_checksum == True:
df = pd.DataFrame(columns=['FileName', 'Path', 'inode', 'NumberOfLinks', 'CheckSum'])
for p in ps:
if p.lstat().st_size > min_file_size * 1024 * 1024:
new_row = {'FileName': p.name,
'Path': str(p),
'inode': p.lstat().st_ino,
'NumberOfLinks': p.lstat().st_nlink,
'CheckSum': get_file_checksum(p)}
df = df.append(new_row, ignore_index=True)
return df
else:
df = pd.DataFrame(columns=['FileName', 'Path', 'inode', 'NumberOfLinks'])
for p in ps:
if p.lstat().st_size > min_file_size * 1024 * 1024:
new_row = {'FileName': p.name,
'Path': str(p),
'inode': p.lstat().st_ino,
'NumberOfLinks': p.lstat().st_nlink}
df = df.append(new_row, ignore_index=True)
return df
2021-08-14 21:45:26 +10:00
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='convert hardlink to symlink in download folder')

    parser.add_argument('-s', '--source',
                        type=str,
                        help='the path to directory with source files')
    parser.add_argument('-t', '--target',
                        type=str,
                        help='the path to directory to move files')
    parser.add_argument('--size',
                        type=int,
                        help='Expected minimum file size')
    # 'True'/'False' string choices are kept (instead of store_true) for
    # backward compatibility with existing invocations.
    parser.add_argument('--csv',
                        type=str,
                        choices=['False', 'True'],
                        help='export source folders unique files into csv')
    parser.add_argument('--order',
                        type=str,
                        help='order exported csv by name or inode',
                        choices=['inode', 'name', 'checksum'])
    parser.add_argument('--csv_path',
                        type=str,
                        help='path to export csv')
    parser.add_argument('--move',
                        type=str,
                        help='Confirm whether to move all files with NumberOfLinks as 1 from source directory to target directory',
                        choices=['False', 'True'])
    parser.add_argument('--hardlink',
                        type=str,
                        help='Whether copy files in source directory (via hardlink) to target directory',
                        choices=['True', 'False'])
    parser.add_argument('--unique',
                        type=str,
                        help='Whether the copy file is unique in target directory',
                        choices=['True', 'False'])

    args = parser.parse_args()

    min_file_size = 50  # MiB default; overridden by --size

    # Fail fast with a usage message instead of crashing later with a
    # NameError when --source/--target were omitted (original bug: the
    # variables were only bound inside `if args.source:` guards).
    if not args.source:
        parser.error('--source is required')
    path_source = args.source
    path_target = args.target

    if args.size:
        min_file_size = args.size

    paths_source = find_all_files(pathlib.Path(path_source))

    if args.csv == "True":
        # Export csv of source directory to csv_path; can be ordered by
        # name, inode, or checksum.
        csv_path = "result.csv"
        df_csv = get_file_dataframe(paths_source, min_file_size, True)
        if args.order == "inode":
            df_csv = df_csv.sort_values(by="inode")
        elif args.order == "name":
            df_csv = df_csv.sort_values(by="FileName")
        elif args.order == "checksum":
            df_csv = df_csv.sort_values(by="CheckSum")
        if args.csv_path:
            csv_path = args.csv_path
        df_csv.to_csv(csv_path, index_label='Index')

    if (args.move == "True" or args.hardlink == 'True') and not path_target:
        parser.error('--target is required with --move/--hardlink')

    if args.move == "True":
        target_dir = pathlib.Path(path_target)
        for x in paths_source:
            if x.lstat().st_size > min_file_size * 1024 * 1024:
                print(str(datetime.now()) + " : " + str(x))
                new_path = pathlib.Path(target_dir, x.name).resolve()
                print("New path: " + str(new_path))
                # NOTE(review): rename() fails across filesystems; this
                # assumes source and target share a volume — confirm.
                x.rename(new_path)

    if args.hardlink == 'True':
        target_dir = pathlib.Path(path_target)
        paths_target = find_all_files(target_dir)
        df_library = get_file_dataframe(paths_source, min_file_size, False)
        df_target = get_file_dataframe(paths_target, min_file_size, False)
        print(df_library)
        print(df_target)
        print(df_target.inode)
        # Build the inode set once: O(1) membership per row instead of
        # scanning the list on every iteration.
        target_inodes = set(df_target.inode.to_list())
        for index, row in df_library.iterrows():
            print(row.Path)
            if row.inode not in target_inodes:
                p = pathlib.Path(row.Path)
                target_path = pathlib.Path(target_dir, row.FileName)
                # Path.link_to was deprecated in 3.10 and removed in 3.12
                # (its argument order was also confusing); os.link(src, dst)
                # is the portable equivalent.
                os.link(p, target_path)
                print("Create hardlink of " + str(p) + " at location " + str(target_path))
            else:
                print("Is in target directory")