import time
import sys
from Bio import SeqIO
import pandas as pd
import re
from bisect import bisect_left

# https://stackoverflow.com/questions/60936318/speed-up-regex-matching-in-python


class fastaSearch():
    """Search peptide regular expressions against FASTA protein sequences.

    Sequences are stored with every "L" replaced by "I", so the peptide
    patterns given to ``addPeptides`` are matched against that converted
    form.

    Attributes:
        fasta: parallel lists ``"accession"`` (record ids) and
            ``"sequences"`` (L->I converted sequences).
        accessions: record id -> dict holding the id, a description string
            and the unmodified sequence.
        peptides: parallel lists ``"pep"`` (peptide keys) and ``"pepreg"``
            (the regular expression associated with each peptide).
    """

    def __init__(self):
        # Original author's note: not sure this class is actually useful.
        self.fasta = {"accession": [], "sequences": []}
        self.accessions = dict()
        self.peptides = {"pep": [], "pepreg": []}

    def addSequences(self, fasta):
        """Load the records of a FASTA file, skipping already known ids.

        Args:
            fasta: path of the FASTA file to read.

        A message is printed for every duplicated accession, and a summary
        with the number of newly added sequences is printed at the end.
        """
        # Set copy of the known ids: O(1) membership instead of scanning
        # the accession list once per record.
        known = set(self.fasta["accession"])
        nb_seq = 0
        with open(fasta) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                if record.id in known:
                    print(f"accession {record.id} is already present in " +
                          "fasta files check your databases")
                    continue
                sequence = str(record.seq)
                # NOTE(review): id and description are concatenated without
                # a separator; kept as-is because downstream code may rely
                # on this exact string -- confirm whether it is intended.
                self.accessions[record.id] = {"accession": record.id,
                                              "fullDescr": f'{record.id}' +
                                              f'{record.description}',
                                              "sequence": sequence}
                self.fasta["accession"].append(record.id)
                # Leucine and isoleucine are made indistinguishable.
                self.fasta["sequences"].append(sequence.replace("L", "I"))
                known.add(record.id)
                nb_seq += 1
        print(f"fasta file {fasta}, contains " +
              f"{nb_seq} non duplicated sequences")

    def addPeptides(self, seqs):
        """Register peptides and their search patterns.

        Args:
            seqs: mapping whose iteration yields peptide keys and where
                ``seqs[key]`` is the associated regular expression.
        """
        for seq in seqs:
            self.peptides["pep"].append(seq)
            self.peptides["pepreg"].append(seqs[seq])
        # Patterns are deliberately stored uncompiled: searchSeq2 joins
        # them as strings to build one combined expression per chunk.

    def __sequenceIndex(self):
        """Build the newline-joined sequence text and its newline offsets."""
        self.concatSequences = "\n".join(self.fasta["sequences"])
        # Offsets of the separators; the number of separators located
        # before a match position is the index of the matched sequence.
        self.indexSequences = [i for i, c in enumerate(self.concatSequences)
                               if c == "\n"]

    def searchSeq(self):
        """Search every peptide pattern in the concatenated sequences.

        Returns:
            A list of ``(sequence_index, peptide_index)`` tuples, with one
            entry per run of consecutive matches of a peptide inside the
            same sequence. Indices refer to ``self.fasta`` and
            ``self.peptides`` respectively.

        Progress is written to stdout: a "*" every 100 peptides and a
        timing line every 1000 peptides.
        """
        self.__sequenceIndex()
        start = time.time()
        result = []
        count = 0
        print("start search")
        # NOTE(review): a pattern able to match "\n" (e.g. a negated
        # character class) could match across two neighbouring sequences
        # -- confirm the patterns cannot do that.
        for j, expr in enumerate(self.peptides["pepreg"]):
            # finditer may find several occurrences in the same sequence;
            # they are consecutive, so remembering the last index is enough.
            previ = -1
            for m in re.finditer(expr, self.concatSequences):
                # Number of separators before the match = sequence index.
                i = bisect_left(self.indexSequences, m.start())
                if i == previ:
                    continue
                previ = i
                result.append((i, j))
            count += 1
            if count % 1000 == 0:
                stop = time.time()
                sys.stdout.write(f"* {count} peptides searched in " +
                                 f"{stop-start} seconde\n")
                sys.stdout.flush()
                start = time.time()
            elif count % 100 == 0:
                sys.stdout.write("*")
                sys.stdout.flush()
        return result

    def searchSeq2(self, chunk_size=1000):
        """Search peptide patterns chunk by chunk with combined expressions.

        The patterns are grouped ``chunk_size`` at a time into a single
        alternation, and each sequence is tested once per chunk.

        Args:
            chunk_size: number of peptide patterns combined into one
                regular expression (default 1000).

        Returns:
            The accessions whose sequence matched, in the order they were
            printed. An accession appears once per chunk it matches.

        Raises:
            ValueError: if ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        pepregs = self.peptides["pepreg"]
        nb_pepreg = len(pepregs)
        # Ceiling division: no spurious empty chunk when the number of
        # patterns is zero or an exact multiple of chunk_size.
        nb_slice = (nb_pepreg + chunk_size - 1) // chunk_size
        print(f"Number of slice {nb_slice}")
        matched = []
        start = time.time()
        for lower in range(0, nb_pepreg, chunk_size):
            chunk = pepregs[lower:lower + chunk_size]
            # Non-capturing groups: only match / no match is needed here.
            combined = re.compile("(?:" + ")|(?:".join(chunk) + ")")
            # Walk accessions and sequences together so duplicated
            # sequences report their own accession, without a list scan.
            for accession, seq in zip(self.fasta["accession"],
                                      self.fasta["sequences"]):
                if combined.search(seq) is not None:
                    print(accession)
                    matched.append(accession)
            print(f"{len(chunk)} sequence in {time.time()-start} secondes")
            start = time.time()
        return matched



