# Tentative parquet reader that loads columns into a dictionary, line by line.
# Requires the parquet-reader CLI to be installed.
import json
import subprocess
import re
import os
import pandas as pd
from utils.proForma import proFormaTranslateUnimodDiannToMass


class parquetReader():
    """Read selected columns of a parquet file through the ``parquet-reader`` CLI.

    On construction the file metadata is fetched with
    ``parquet-reader --json`` and stored in ``parquetInfo``. ``columnInfos``
    maps every column name to ``{"Id": ..., "Type": ...}`` and ``TotalRows``
    holds the row count. ``parquetRead`` then dumps the requested columns one
    at a time and assembles them into the ``dictCols`` DataFrame, which every
    other method reads or rewrites.
    """

    # Index of the first value line in the ``parquet-reader --dump`` output
    # for a single column.
    # NOTE(review): this offset depends on the CLI's output layout (and
    # presumably on the file having a single row group) - confirm when
    # upgrading parquet-reader.
    _DUMP_HEADER_LINES = 19

    # Everything up to (and including) the last "V:" on a dumped line is
    # stripped, leaving only the value text.
    _VALUE_PREFIX = re.compile(r"^.*V:")

    # Columns (after "." -> "_" renaming) that are forced to float at the end
    # of ``parquetRead`` when they are present.
    _FLOAT_COLUMNS = ("Precursor_Charge", "Precursor_Mz", "Precursor_Quantity",
                      "RT", "Quantity_Quality", "PG_Q_Value")

    def __init__(self, filename):
        """Load the metadata of ``filename``.

        Args:
            filename: path to an existing parquet file.

        Raises:
            FileNotFoundError: if ``filename`` is not an existing file
                (a subclass of ``Exception``, which was raised previously).
        """
        if not os.path.isfile(filename):
            raise FileNotFoundError(
                f"{filename} does not exist, "
                "please verify spelling or file location")
        self.filename = filename
        chars = subprocess.run(["parquet-reader", "--json", filename],
                               capture_output=True, text=True)
        self.parquetInfo = json.loads(chars.stdout)
        self.columnInfos = {}
        for column in self.parquetInfo["Columns"]:
            logicalType = column["LogicalType"]["Type"]
            # Columns without a logical type are treated as floats.
            self.columnInfos[column["Name"]] = {
                "Id": column["Id"],
                "Type": "Float" if logicalType == "None" else logicalType}
        self.TotalRows = int(self.parquetInfo["TotalRows"])
        print(f"Nb Columns : {len(self.columnInfos.keys())}")
        print(f"Nb Rows : {self.TotalRows}")

    def parquetRead(self, columnNames, seqlist=None):
        """Dump ``columnNames`` from the file into the ``dictCols`` DataFrame.

        Each column is cast according to its entry in ``columnInfos``
        ("Float" -> float, "Int" -> int, anything else -> str). Two columns,
        ``proForma_key`` and ``proForma_no_loc``, are derived from
        "Modified.Sequence", and finally every "." in the column names is
        replaced by "_".

        Args:
            columnNames: names of the columns to load; must include
                "Modified.Sequence" (and "Stripped.Sequence" when
                ``seqlist`` is given).
            seqlist: optional collection of stripped sequences; when given,
                only rows whose "Stripped.Sequence" is in it are kept.
        """
        print(f"reading {self.filename}")
        extracted = {}
        firstValue = self._DUMP_HEADER_LINES
        for columnName in columnNames:
            print(f"Loading {columnName} column from {self.filename}")
            Id = self.columnInfos[columnName]["Id"]
            dumpLines = subprocess.run(["parquet-reader",
                                        "--dump",
                                        f"--columns={Id}",
                                        self.filename],
                                       capture_output=True,
                                       text=True).stdout.split("\n")
            valueLines = dumpLines[firstValue:self.TotalRows + firstValue]
            extracted[columnName] = [self._VALUE_PREFIX.sub("", x)
                                     for x in valueLines]
        self.dictCols = pd.DataFrame(extracted)

        # Cast every requested column (the previous loop reused a stale
        # variable and only ever touched the last column, and its missing
        # ``elif`` turned float columns back into strings).
        for columnName in columnNames:
            columnType = self.columnInfos[columnName]["Type"]
            if columnType == "Float":
                dtype = "float"
            elif columnType == "Int":
                dtype = "int"
            else:
                dtype = "str"
            self.dictCols[columnName] = self.dictCols[columnName].astype(dtype)

        if seqlist is not None:
            print("Select only infered peptides")
            # .copy() so the column assignments below act on an independent
            # frame rather than on a slice of the unfiltered one.
            self.dictCols = self.dictCols[
                self.dictCols["Stripped.Sequence"].isin(seqlist)].copy()
            print(f"Number of selected lines : {self.dictCols.shape[0]}")

        print("Generate proForma column")
        self.dictCols["proForma_key"] = self.dictCols[
            "Modified.Sequence"].apply(proFormaTranslateUnimodDiannToMass)
        print("Generate proForma column with "
              "unspecified modification position")
        self.dictCols["proForma_no_loc"] = self.dictCols[
            "Modified.Sequence"].apply(proFormaTranslateUnimodDiannToMass,
                                       no_pose=True)

        print("Replace . in column by _")
        self.dictCols.columns = [x.replace(".", "_")
                                 for x in self.dictCols.columns]
        # Only cast the numeric columns that were actually requested;
        # astype() raises KeyError on names missing from the frame.
        floatColumns = {name: "float" for name in self._FLOAT_COLUMNS
                        if name in self.dictCols.columns}
        self.dictCols = self.dictCols.astype(floatColumns)
        print(f'The result files contains {self.dictCols.shape[0]} lines and {self.dictCols.shape[1]}')

    def getMsrunfileNames(self):
        """Return the distinct values of the "Run" column as a list."""
        return self.dictCols['Run'].unique().tolist()

    def filteringResult(self, col_threshold):
        """Drop the rows of ``dictCols`` that fail the given thresholds.

        Args:
            col_threshold: mapping of column name -> threshold. For
                "Quantity_Quality" rows are kept when the value is strictly
                greater than the threshold; for every other column rows are
                kept when the value is strictly lower. Names that are not
                columns of ``dictCols`` are ignored.

        Rows containing a missing value in any column are dropped as well.
        """
        before_filtering = self.dictCols.shape[0]
        keep = pd.Series(True, index=self.dictCols.index)
        for col in col_threshold:
            if col not in self.dictCols.columns:
                continue
            values = self.dictCols[col].astype(float)
            threshold = col_threshold[col]
            if col == "Quantity_Quality":
                keep = keep & (values > threshold)
            else:
                keep = keep & (values < threshold)
        # dropna() is kept so that rows with missing values are still
        # removed, exactly as the previous implementation did.
        self.dictCols = self.dictCols[keep].dropna()
        print(f"Number of line removed : {before_filtering - self.dictCols.shape[0]}")

    def mergePeptidoform(self):
        """Collapse rows that differ only by modification localisation.

        Rows are grouped by run, run index, stripped sequence, precursor
        charge, protein group and ``proForma_no_loc``. Within a group the
        minimum "Quantity_Quality", maximum "PG_Q_Value", mean "RT" and
        "Precursor_Mz" and summed "Precursor_Quantity" are kept, while the
        "proForma_key" and "Modified_Sequence" values are joined with a
        space. The row count is printed before and after merging.
        """
        print(self.dictCols.shape[0])
        groupKeys = ["Run", "Run_Index", "Stripped_Sequence",
                     "Precursor_Charge", "Protein_Group", "proForma_no_loc"]
        aggregations = {"Quantity_Quality": 'min',
                        "PG_Q_Value": 'max',
                        'RT': 'mean',
                        'Precursor_Mz': 'mean',
                        'Precursor_Quantity': 'sum',
                        "proForma_key": list,
                        "Modified_Sequence": list}
        self.dictCols = self.dictCols.groupby(
            groupKeys, as_index=False).agg(aggregations)
        for listColumn in ('proForma_key', 'Modified_Sequence'):
            self.dictCols[listColumn] = self.dictCols[listColumn].str.join(" ")
        print(self.dictCols.shape[0])

    def getUniqueSequences(self):
        """Return the distinct values of "Stripped_Sequence" as a list."""
        return self.dictCols['Stripped_Sequence'].unique().tolist()

    def getDF(self):
        """Return the ``dictCols`` DataFrame itself (not a copy)."""
        return self.dictCols

    def getPeptideJInfo(self):
        """Add a ``peptideJ`` column and return it next to the sequence.

        ``peptideJ`` is "Stripped_Sequence" with every ``L`` and ``I``
        replaced by ``J``.

        Returns:
            DataFrame with the "Stripped_Sequence" and "peptideJ" columns.
        """
        # The previous ``apply(re.sub, pattern=..., repl=...)`` always raised
        # TypeError because each value was passed positionally as ``pattern``.
        # NOTE(review): the original pattern was the literal two-letter
        # string "LI"; a character class is used here on the assumption that
        # J stands for "Leu or Ile" - confirm this is the intent.
        self.dictCols['peptideJ'] = self.dictCols[
            "Stripped_Sequence"].str.replace(r"[LI]", "J", regex=True)
        return self.dictCols[["Stripped_Sequence", "peptideJ"]]
