#   TIDD: peptide rescoring s/w
#
#   Written by H. Li <hllee@hanyang.ac.kr>
#
#   Copyright (C) 2022 BIS Labs, Hanyang Univ. Korea
#
#   TIDD is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   TIDD is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#   
#   You should have received a copy of the GNU General Public License
#   along with TIDD.  If not, see <http://www.gnu.org/licenses/>.

library(e1071);
library(ROCR);

# ---- Run configuration and input ------------------------------------------
# The {{ ... }} placeholders are substituted before the script is executed.
train_size <- {{ train_size }}
iteration <- {{ iteration }}
k_cross <- {{ k_cross }}

# Output locations: the per-PSM feature table with the SVM probability
# column, and a plain dump of the most recent SVM probabilities.
psm_feature_file <- "{{ result_file }}"
last_svmprob_file <- "last_svmprob.tsv"

# Tab-delimited PSM feature table; must contain a "TorD" column in which
# decoy PSMs are marked "D".
data <- read.delim("{{ feature_file }}")

paste("# instances: ", nrow(data))

# Row indices of all decoy PSMs.
dec_id <- which(data[, "TorD"] == "D")

# Training draws train_size decoys without replacement, so abort early when
# the input cannot supply that many.
if (length(dec_id) < train_size) {
  stop(paste0(
    "too few decoy PSMs (", length(dec_id),
    ") to perform TIDD analysis. TIDD needs at least ", train_size,
    " (train_size) decoy PSMs"
  ))
}

# Names of the columns handed to the SVM. "TorD" is the target/decoy label
# and is used as the response; every other entry is a predictor.
check1 <- c(
  "Charge", "PrecursorM", "CalMW", "DeltaMass", "AbsDeltaMass",
  "PeptideLength",
  "SpecOmsScore", "SpecFitScore", "SpecOmsMatchType",
  "SpecOmsCandidateStatus",
  "TIC", "MaxIntALL", "MaxYionInt", "MaxBionInt",
  "SumYmatchInt", "SumBmatchInt", "FracYmatchInt", "FracBmatchInt",
  "SeqCoverYion", "SeqCoverBion", "ConsecutiveYion", "ConsecutiveBion",
  "MassErrMean", "MassErrSD",
  "NumofAnnoPeaks", "NumofComplementPeaks", "SumComplementPeaksInt",
  "FracComplementPeaksInt", "SeqCoverComplementPeaks",
  "IsotopeRatioVectorSize", "IsotopeRatioCoefficientOfDetermination",
  "Hyperscore", "TorD"
)

if (max(data$SpecOmsScore) == 0) {
  # without SpecOMS: the maximum SpecOmsScore is zero, so drop the four
  # SpecOMS-derived columns and keep the remaining features in order.
  check1 <- setdiff(
    check1,
    c("SpecOmsScore", "SpecFitScore", "SpecOmsMatchType",
      "SpecOmsCandidateStatus")
  )
}

# Use the search engine as a feature only when more than one engine is
# present in the input.
data$IdentificationEngine <- as.factor(data$IdentificationEngine)
if (nlevels(data$IdentificationEngine) > 1) {
  # append() returns a new vector; the result has to be assigned back,
  # otherwise the feature list is left unchanged.
  check1 <- append(check1, "IdentificationEngine")
}

# ---- Initial identification set from the search-engine score ---------------
# Pick the Hyperscore cut-off whose estimated FDR (#decoys / #targets at or
# above the cut-off) does not exceed 1%, and use the targets passing it as
# the first pool of positive training candidates.
init_score <- "Hyperscore"

# Number of identified target PSMs (and a label) for the initial scoring and
# for every iteration; slot 1 is the initial result.
IDs <- rep(0, as.numeric(iteration) + 1)
IDs_name <- rep(0, as.numeric(iteration) + 1)

# Candidate cut-offs: the distinct decoy scores rounded to 3 decimals.
scores <- unique(round(data[dec_id, init_score], 3))
Results_FDR <- -1000
score_threshold <- 1000
CIDS <- 0

#  print(length(scores))

for (k in seq_along(scores)) {
  passes <- data[, init_score] >= scores[k]
  TP <- which(passes & data[, "TorD"] == "T")
  FP <- which(passes & data[, "TorD"] == "D")

  # Skip cut-offs with no target hits: the ratio would be NaN/Inf and the
  # comparison below would fail.
  if (length(TP) > 0) {
    FDR <- length(FP) / length(TP)

    # Keep a cut-off only if its FDR is <= 1%, at least as close to 1% as
    # the best one so far, and it yields more target PSMs.
    if ((0.01 - FDR) >= 0 && abs(0.01 - FDR) <= abs(0.01 - Results_FDR)) {
      if (CIDS < length(TP)) {
        Results_FDR <- FDR
        score_threshold <- scores[k]
        CIDS <- length(TP)
      }
    }
  }
}

paste("Init ID:", CIDS)
IDs[1] <- CIDS
IDs_name[1] <- "Init"

# Targets at or above the selected cut-off. This must use score_threshold,
# not scores[k]: after the loop k is simply the last candidate examined.
tar_candidate_id <- which(
  data[, init_score] >= score_threshold & data[, "TorD"] == "T"
)

# ---- Build the modelling table ---------------------------------------------
# Keep only the selected columns and turn the target/decoy label into a
# factor (via character, so any previous factor levels are rebuilt).
ndata <- data[, check1]
ndata[, "TorD"] <- as.factor(as.character(ndata[, "TorD"]))

# Columns that are modelled as categorical rather than numeric. Only those
# actually present in check1 are converted. (Capping Charge at 4 before the
# conversion was tried earlier and is currently disabled.)
categorical_cols <- c(
  "Charge",
  "Triptic",
  "SpecOmsMatchType",
  "SpecOmsCandidateStatus",
  "IdentificationEngine",
  "ConsecutiveYion",
  "ConsecutiveBion"
)
for (col_name in intersect(categorical_cols, check1)) {
  ndata[[col_name]] <- as.factor(ndata[[col_name]])
}

# Per-column overview of the table that will be fed to the SVM.
print(summary(ndata))

  # ---- Iterative SVM rescoring ---------------------------------------------
  # Each pass:
  #   1. draws train_size target candidates and train_size decoys,
  #   2. runs a k_cross-fold evaluation of a linear SVM and records each
  #      fold's AUC,
  #   3. refits on the training part of the fold whose AUC is closest to the
  #      mean AUC and scores every PSM with the probability of class "T",
  #   4. writes the scored table, then re-derives the 1%-FDR cut-off on the
  #      new score to obtain the target candidates for the next pass.
  n_iterations <- as.numeric(iteration)
  n_train <- as.numeric(train_size)
  CV <- as.numeric(k_cross)

  # A dedicated loop variable is used so the configured iteration count is
  # not overwritten; seq_len() also yields no passes when the count is 0.
  for (iter in seq_len(n_iterations)) {

    if (length(tar_candidate_id) < n_train) {
      stop(paste0(
        "too few target PSMs (", length(tar_candidate_id),
        ") passing the FDR threshold in iteration ", iter,
        ". TIDD needs at least ", train_size, " (train_size) target PSMs"
      ))
    }

    # Index through sample.int() instead of sample(x, n): sample() treats a
    # length-one numeric x as 1:x. For longer vectors the draw is the same.
    tar_c <- tar_candidate_id[
      sample.int(length(tar_candidate_id), n_train, replace = FALSE)
    ]
    dec_c <- dec_id[sample.int(length(dec_id), n_train, replace = FALSE)]

    auc.error <- rep(0, CV)

    x_tar <- ndata[tar_c, ]
    x_dec <- ndata[dec_c, ]

    set.seed(11)
    # Contiguous fold labels 1..CV over the sampled rows.
    t_fold <- cut(seq(1, nrow(x_tar)), breaks = CV, labels = FALSE)
    d_fold <- cut(seq(1, nrow(x_dec)), breaks = CV, labels = FALSE)

    for (i in seq_len(CV)) {
      t_testid <- which(t_fold == i, arr.ind = TRUE)
      d_testid <- which(d_fold == i, arr.ind = TRUE)

      Tdata <- rbind(x_tar[t_testid, ], x_dec[d_testid, ])
      TrainData <- rbind(x_tar[-t_testid, ], x_dec[-d_testid, ])

      set.seed(1)
      svm.fit <- svm(
        TorD ~ .,
        data = TrainData,
        probability = TRUE,
        kernel = "linear",
        type = "C-classification"
      )

      pred.prob <- predict(svm.fit, Tdata, probability = TRUE)
      # Select the target-class probability by name rather than by position.
      pred_1 <- prediction(
        attr(pred.prob, "probabilities")[, "T"],
        Tdata$TorD
      )
      perf <- performance(pred_1, "auc")
      auc.error[i] <- as.numeric(as.character(unlist(perf@y.values[[1]])))
    }

    #          print(auc.error)
    # Fold whose AUC is closest to the mean AUC (first one on ties).
    stopifnot(!anyNA(auc.error))
    meanAUC <- mean(auc.error)
    index <- which.min(abs(auc.error - meanAUC))

    ######### TEST: refit on that fold's training part and score all PSMs
    t_testid <- which(t_fold == index, arr.ind = TRUE)
    d_testid <- which(d_fold == index, arr.ind = TRUE)

    TrainData <- rbind(x_tar[-t_testid, ], x_dec[-d_testid, ])
    svm.fit <- svm(
      TorD ~ .,
      data = TrainData,
      type = "C-classification",
      probability = TRUE,
      kernel = "linear"
    )
    pred.prob <- predict(svm.fit, ndata, probability = TRUE)

    scorename <- paste0("SVM_Prob_", iter)
    newTestData <- cbind(ndata, attr(pred.prob, "probabilities")[, "T"])
    colnames(newTestData)[ncol(newTestData)] <- scorename

    # Both files are rewritten on every pass, so after the loop they hold the
    # scores of the final iteration.
    write.table(newTestData, psm_feature_file, sep = "\t")
    write(newTestData[, scorename], file = last_svmprob_file)

    # Candidate cut-offs: distinct decoy probabilities rounded to 3 decimals.
    scores <- unique(round(newTestData[dec_id, scorename], 3))
    #tmpdata<-cbind(data$Xcorr,data$TorD)

    Results_FDR <- -1000
    score_threshold <- 1000
    CIDS <- 0

    for (k in seq_along(scores)) {
      passes <- newTestData[, scorename] >= scores[k]
      TP <- which(passes & newTestData[, "TorD"] == "T")
      FP <- which(passes & newTestData[, "TorD"] == "D")

      if (length(TP) > 0) {
        FDR <- length(FP) / length(TP)

        if (0.01 - FDR >= 0 && abs(0.01 - FDR) <= abs(0.01 - Results_FDR)) {
          # Comparison, not assignment: `CIDS<-length(...)` would assign and
          # make this test true for every cut-off with at least one target.
          if (CIDS < length(TP)) {
            Results_FDR <- FDR
            score_threshold <- scores[k]
            CIDS <- length(TP)
          }
        }
      }
    }

    message(paste("Results_FDR=", Results_FDR))

    message(paste("score_threshold=", score_threshold))

    # Targets passing the new cut-off seed the next pass and are counted as
    # this pass's identifications.
    tar_candidate_id <- which(
      newTestData[, scorename] >= score_threshold &
        newTestData[, "TorD"] == "T"
    )
    message(paste(paste("iteration ", iter), ": ", length(tar_candidate_id)))
    IDs[(iter + 1)] <- length(tar_candidate_id)
    IDs_name[(iter + 1)] <- paste0("Iter", iter)
  }
  
  
#  barplot(IDs, main="Identified PSMs", names=IDs_name,xlab="Model", ylab="# of identified PSMs")
  

