spacesavers2_mimeo

#!/usr/bin/env python3
import sys
import os
import gzip
import textwrap
import time
import shutil
import subprocess
import tqdm

MINDEPTH = 3
QUOTA_TB = 20

from src.VersionCheck import version_check
from src.VersionCheck import __version__

version_check()

from src.FileDetails import FileDetails
from src.dfUnit import dfUnit
from src.Summary import Summary
from src.Summary import pathlen
from src.utils import *
from datetime import date

import argparse

def process_hh(
    uid,
    hashhash,
    hashhashsplits,
    mindepth,
    maxdepth,
    uid2uname,
    gid2gname,
    perfolder_summaries,
    perfolder_dups,
    user_output,
):
    for h in hashhash.keys():
        # if files have the same forward and reverse hashes but different sizes then 
        # hashes are split into multiple hashes with <underscore><splitnumber> suffix
        # being added to the bottom hash for each size
        split_required = hashhash[h].compute(
            hashhashsplits
        )  # compute if split is needed or if we have duplicates
        if split_required:
            continue  # split is required so move on to the next hash as new hashes with <underscore><splitnumber> have been created by compute and added to hashhashsplits ... deal with them there!
        # get indexes to files in the flist that belong to user with uid
        # if uid is zero, then get all file indexes
        uid_file_index = hashhash[h].get_user_file_index(uid)
        if len(uid_file_index) == 0: # user with uid has no files in this set
            continue
        oldest_index = hashhash[h].oldest_index
        foldest = hashhash[h].flist[oldest_index]
        user_owns_original = False
        if foldest.uid == uid or 0 == uid : user_owns_original = True
        uid_file_index = list(filter(lambda x:x!=oldest_index,uid_file_index)) # remove oldest if present in list
        inodes_already_summerized = [foldest.inode]
        if user_owns_original:
            fpaths = foldest.get_paths(mindepth, maxdepth)
            for p in fpaths:
                perfolder_summaries[p].nnondup_files += 1
                perfolder_summaries[p].non_dup_Bytes.append(foldest.calculated_size)
                perfolder_summaries[p].non_dup_ages.append(foldest.mtime)
        # if hashhash[h].ndup_files > 0: # we have duplicates
        if len(uid_file_index) > 0: # uid has copies
            for i in uid_file_index:
                f = hashhash[h].flist[i]
                fpath = f.apath
                parent = fpath.parent
                fpaths = f.get_paths(mindepth, maxdepth)
                if f.inode in inodes_already_summerized: # it is a hardlink
                    for p in fpaths:
                        perfolder_summaries[p].nnondup_files += 1
                else:
                    inodes_already_summerized.append(f.inode)
                    if not parent in perfolder_dups:
                        perfolder_dups[fpath.parent] = 0
                    perfolder_dups[fpath.parent] += f.calculated_size
                    for p in fpaths:
                        perfolder_summaries[p].ndup_files+=1
                        perfolder_summaries[p].dup_Bytes.append(f.calculated_size)
                        perfolder_summaries[p].dup_ages.append(f.mtime)
        out_index = []
        out_index.append(oldest_index)
        out_index.extend(uid_file_index)
        if args.duplicatesonly and len(out_index)==1: continue
        user_output.write(
            "{}\n".format(
                hashhash[h].str_with_name(uid2uname, gid2gname, out_index)
            )
        )


def main():
    start = time.time()
    scriptname = os.path.basename(__file__)
    elog = textwrap.dedent(
        """\
    Version:
        {}
    Example:
        > spacesavers2_mimeo -f /output/from/spacesavers2_catalog -o /path/to/output/folder -d 7 -q 10
        """.format(
            __version__
        )
    )
    parser = argparse.ArgumentParser(
        description="spacesavers2_mimeo: find duplicates",
        epilog=elog,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        "-f",
        "--catalog",
        dest="catalog",
        required=True,
        type=str,
        default=sys.stdin,
        help="spacesavers2_catalog output from STDIN or from catalog file",
    )

    parser.add_argument(
        "-d",
        "--maxdepth",
        dest="maxdepth",
        required=False,
        type=int,
        default=10,
        help="folder max. depth upto which reports are aggregated ... absolute path is used to calculate depth (Default: 10)",
    )

    parser.add_argument(
        "-o",
        "--outdir",
        dest="outdir",
        required=False,
        type=str,
        default=os.getcwd(),
        help="output folder",
    )

    parser.add_argument(
        "-p",
        "--prefix",
        dest="prefix",
        required=False,
        type=str,
        default="",
        help="prefix for all output files",
    )

    parser.add_argument(
        "-q",
        "--quota",
        dest="quota",
        required=False,
        type=float,
        default=200.0,
        help="total quota of the mount eg. 200 TB for /data/CCBR",
    )

    parser.add_argument(
        "-z",
        "--duplicatesonly",
        dest="duplicatesonly",
        required=False,
        action=argparse.BooleanOptionalAction,
        help="Print only duplicates to per user output file.",
    )
    
    parser.add_argument(
        "-k",
        "--kronaplot",
        dest="kronaplot",
        required=False,
        action=argparse.BooleanOptionalAction,
        help="Make kronaplots for duplicates.(ktImportText must be in PATH!)",
    )

    parser.add_argument("-v", "--version", action="version", version=__version__)

    print_with_timestamp(
        start=start, scriptname=scriptname, string="version: {}".format(__version__)
    )

    global args
    args = parser.parse_args()
    quota = args.quota * 1024 * 1024 * 1024 * 1024

    if args.kronaplot:
        ktImportText_in_path = False
        if shutil.which("ktImportText") == None:
            sys.stderr.write("ktImportText(from kronaTools) not found in PATH. kronaplots will not be generated.\n")
        else:
            ktImportText_in_path = True

    uid2uname = dict()
    gid2gname = dict()
    hashhash = dict()
    users = set()  # list of all uids found
    users.add(0)  # 0 == all users
    groups = set()  # list of gids
    paths = set() # set of all paths possible 
    path_lens = set() # set of all path depths
    print_with_timestamp(
        start=start, scriptname=scriptname, string="Reading in catalog file..."
    )
    set_complete = True
    folder_info = dict()
    with open(args.catalog) as catalog:
        for l in tqdm.tqdm(catalog): 
            fd = FileDetails()
            set_complete = fd.set(l)
            if not set_complete:
                continue
            if fd.fld != "d" and fd.fld !="f": # not a file or folder
                continue  # ignore all symlinks
            users.add(fd.uid)
            groups.add(fd.gid)
            path_lens.add(fd.get_depth())
            for p in fd.get_paths_at_all_depths():
                paths.add(p)
            if fd.fld == "d":
                if not fd.apath in folder_info:
                    folder_info[fd.apath] = fd
                continue
            hash = fd.xhash_top + "#" + fd.xhash_bottom
            if hash == "#":  # happens when file cannot be read
                sys.stderr.write(
                    "Cannot read file listed in catalog and will be excluded from mimeo: {} \n".format(
                        fd.apath
                    )
                )
                continue
            if not hash in hashhash:
                hashhash[hash] = dfUnit(hash)
            hashhash[hash].add_fd(fd)
    min_path_len = min(path_lens)
    max_path_len = max(path_lens)
    if args.maxdepth > max_path_len:
        mindepth = min_path_len
        maxdepth = max_path_len
    elif args.maxdepth > min_path_len:
        mindepth = min_path_len
        maxdepth = args.maxdepth
    else:
        mindepth = args.maxdepth
        maxdepth = args.maxdepth

    # filter paths by maxdepth
    paths = list(filter(lambda x: get_folder_depth(x) <= maxdepth, paths))
    paths = list(filter(lambda x: get_folder_depth(x) >= mindepth, paths))
    # convert PosixPaths list to list of strings
    paths = list(map(lambda x: str(x), paths))
    # sort paths for outfile aesthetics
    paths.sort()
    # reconvert to paths
    paths = list(map(lambda x: Path(x), paths))
    users = list(users)
    for uid in users:
        uid2uname[uid] = get_username_groupname(uid)
    for gid in groups:
        gid2gname[gid] = get_username_groupname(gid)
    print_with_timestamp(
        start=start, scriptname=scriptname, string="Done reading in catalog file!"
    )
    print_with_timestamp(
        start=start,
        scriptname=scriptname,
        string="Total Number of paths: %d" % len(paths),
    )
    print_with_timestamp(
        start=start,
        scriptname=scriptname,
        string="Total Number of users: %d" % len(users),
    )
    blamematrixtsv = os.path.join(
        os.path.abspath(args.outdir), args.prefix + "." + "blamematrix.tsv"
    )
    blamematrix = dict()
    all_blamematrix_paths = set()
    # users=[0] # debug only
    for uid in users:
        blamematrix[uid] = dict()
        print_with_timestamp(
            start=start,
            scriptname=scriptname,
            string="Gathering stats for user: %s" % (uid2uname[uid]),
        )
        if args.prefix != "":
            outfilenameprefix = args.prefix + "." + get_username_groupname(uid)
        else:
            outfilenameprefix = get_username_groupname(uid)

        summaryfilepath = os.path.join(
            os.path.abspath(args.outdir), outfilenameprefix + ".mimeo.summary.txt"
        )
        useroutputpath = os.path.join(
            os.path.abspath(args.outdir), outfilenameprefix + ".mimeo.files.gz"
        )
        if args.kronaplot:
            kronatsv = os.path.join(
                os.path.abspath(args.outdir), outfilenameprefix + ".mimeo.krona.tsv"
            )
            if ktImportText_in_path:
                kronahtml = os.path.join(
                    os.path.abspath(args.outdir), outfilenameprefix + ".mimeo.krona.html"
                )

        with open(summaryfilepath, "w") as user_summary:
            user_summary.write("%s\n" % (Summary.HEADER))

        with gzip.open(useroutputpath, "wt") as user_output, open(
            summaryfilepath, "a"
        ) as user_summary:
            perfolder_summaries = dict()
            perfolder_dups = dict()
            for p in paths:
                perfolder_summaries[p] = Summary(p)
                if not p in folder_info:
                    folder_info[p] = FileDetails()
                    folder_info[p].initialize(p)
                fd = folder_info[p]
                for p2 in fd.get_paths(mindepth,maxdepth):
                    if not p2 in folder_info:
                        folder_info[p2] = FileDetails()
                        folder_info[p2].initialize(p2)
                    fd2 = folder_info[p2]
                    if fd2.uid == uid or uid == 0:
                        perfolder_summaries[p2].folder_Bytes += fd.calculated_size

            hashhashsplits = dict()  # dict to collect instances where the files are NOT duplicates has same hashes but different sizes (and different inodes) ... new suffix is added to bottomhash .."_iterator"
            process_hh(
                uid,
                hashhash,
                hashhashsplits,
                mindepth,
                maxdepth,
                uid2uname,
                gid2gname,
                perfolder_summaries,
                perfolder_dups,
                user_output,
            )
            if len(hashhashsplits) != 0: 
                hashhashsplitsdummy = dict()
                process_hh(
                    uid,
                    hashhashsplits,
                    hashhashsplitsdummy,
                    mindepth,
                    maxdepth,
                    uid2uname,
                    gid2gname,
                    perfolder_summaries,
                    perfolder_dups,
                    user_output,
                )
                del hashhashsplitsdummy
            del hashhashsplits
            for p in paths:
                perfolder_summaries[p].update_scores(quota)
                user_summary.write(f"{perfolder_summaries[p]}\n")
            for p in perfolder_summaries:
                p_depth = len(list(p.parents))
                if p_depth == mindepth:
                    all_blamematrix_paths.add(p)
                    blamematrix[uid][p] = sum(perfolder_summaries[p].dup_Bytes)
        
        if args.kronaplot:
            print_with_timestamp(
                start=start,
                scriptname=scriptname,
                string="Creating Kronachart for user: %s" % (uid2uname[uid]),
            )
            with open(kronatsv,'w') as ktsv:
                for p in perfolder_dups:
                    path = str(p)
                    path = path.replace('/','\t')
                    path = path.replace('\t\t','\t')
                    if perfolder_dups[p] != 0:
                        ktsv.write("%d\t%s\n"%(perfolder_dups[p],path))
            if ktImportText_in_path:
                cmd = "ktImportText %s -o %s"%(kronatsv,kronahtml)
                srun = subprocess.run(cmd,shell=True, capture_output=True, text=True)
                if srun.returncode !=0:
                    sys.stderr.write("%s\n"%(srun.stderr))    

    del hashhash

    print_with_timestamp(
        start=start,
        scriptname=scriptname,
        string="Creating Blamematrix",
    )
    with open(blamematrixtsv,'w') as btsv:
        outlist = ["path"]
        uids = list(blamematrix.keys())
        uids.sort()
        for uid in uids:
            outlist.append(uid2uname[uid])
        btsv.write("\t".join(outlist)+"\n")
        for p in all_blamematrix_paths:
            outlist = [str(p)]
            s = 0
            for uid in uids:
                if p in blamematrix[uid]:
                    s += blamematrix[uid][p]
                    outlist.append(str(blamematrix[uid][p]))
                else:
                    outlist.append(str(0))
            if s != 0 : btsv.write("\t".join(outlist)+"\n")

    print_with_timestamp(start=start, scriptname=scriptname, string="Finished!")

if __name__ == "__main__":
    main()