#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Created on 05 October 2020
# @authors: Niklas Siedhoff, Alexander-Maurice Illig
# @contact: <n.siedhoff@biotec.rwth-aachen.de>
# PyPEF - Pythonic Protein Engineering Framework
# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0)
# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode

# PyPEF – An Integrated Framework for Data-Driven Protein Engineering
# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476
# https://doi.org/10.1021/acs.jcim.1c00099
# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,*
# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany
# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany
# *Corresponding author
# §Equal contribution


# docstring for argument parsing using docopt
"""
PyPEF - Pythonic Protein Engineering Framework

Modeling options
----------------
    I. Pure ML modeling
    -------------------
        PyPEF provides three encoding options for training machine learning models, i.e.
        regression models trained by supervised learning:

            1. DCA: Direct coupling analysis (DCA) based on evolutionary couplings (input
                       coupling parameter file generated by the C framework PLMC).

            2. AAidx: Based on AAindex descriptors (566 amino acid descriptor files
                       taken from the AAindex database).

            3. OneHot: One-hot encoding representing the occurrence of an
                       amino acid at a sequence position as a single 1 and 19 0's.

        Any encoding technique enables pure ML-based modeling, see
        https://doi.org/10.1021/acs.jcim.1c00099
        and DCA-based sequence encoding enables a hybrid modeling approach, see
        https://doi.org/10.1101/2022.06.07.495081

        If an MSA can be constructed for the target sequence, e.g. using Jackhmmer,
        encoding option 1 will likely outperform encoding option 2.
        However, encoding option 2 provides a static encoding technique that is
        independent from the evolutionary history of a target sequence and
        without the need for MSA construction.
        Here, the AAidx encodings for modeling are compared, i.e. validated, with respect
        to their performance on the test set (comparable to a hyperparameter search
        for finding the best static encoding set for model inference).
        Further, one-hot encoding (encoding option 3) provides a simple but fast and often
        well-performing encoding option that will likely outperform the AAindex-based
        technique for model generalization.

    II. Hybrid modeling
    -------------------
        Constructing a hybrid model that combines pure statistical DCA-based prediction (a
        variant's relative 'evolutionary energy' to the wild type) and DCA-encoding based
        training of a ML model similar to pure ML modeling option I.1.
        Based on features generated from the direct coupling analysis (.params file output
        by PLMC). Individual model contributions are optimized based only on Spearman's
        correlation coefficient and thus, only variant fitness ranks are to be considered
        for evaluating model performance, not the exact predicted fitness value. For regression,
        only L2-regularized linear regression (Ridge regression) is provided as modeling option.


Running example of training, testing, and inferring a pure ML model for predictions
-----------------------------------------------------------------------------------
Exemplary running of PyPEF for training a pure ML model using encoding option 2
based on features generated from the AAIndex database (566 amino acid descriptor
indices taken from the AAIndex database).
 1. Create files for training and testing from variant-fitness CSV data:
       pypef mklsts -i variant_and_fitness.csv -w wt_sequence.fasta
 2. Train and validate models:
        pypef ml -e aaidx -l LS.fasta -t TS.fasta --regressor pls
 3. Plot the test set entries against test set predictions (creates PNG figure, MODEL12345 is
    the chosen AAindex, e.g. FAUJ880104):
        pypef ml -e aaidx -m MODEL12345 -f TS.fasta --label
 4. Create files for prediction:
    - Single file:
        pypef mkps -w wt_sequence.fasta -i variant_fitness.csv
    - Recombinant/diverse prediction files:
        pypef mkps -w wt_sequence.fasta -i variant_fitness.csv
            [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
            [--ddiverse] [--tdiverse] [--qdiverse]
 5. Predict (unknown/new) variants:
    - Single file:
        pypef ml -e aaidx -m MODEL12345 -p Prediction_Set.fasta
    - Recombinant/diverse prediction files in created prediction set folders:
        pypef ml -e aaidx -m MODEL12345 --pmult [--drecomb] [...] [--qdiverse]
    - Directed evolution – for performing and plotting in silico evolution trajectories:
        pypef ml -e aaidx directevo -m MODEL12345 [...]
Note: The commands for hybrid modeling are very similar to the commands for pure ML modeling,
see pypef [-h] for possible commands.


Additional helpful commands for data conversion
-----------------------------------------------
Creation of learning and test sets – splitting CSV variant-fitness data:
        pypef mklsts --wt WT_SEQ --input CSV_FILE
            [--drop THRESHOLD] [--numrnd NUMBER]

Creation of prediction sets from CSV data (using single-substituted variant data):
        pypef mkps --wt WT_SEQ --input CSV_FILE
            [--drop THRESHOLD] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
            [--ddiverse] [--tdiverse] [--qdiverse]

Encoding a CSV file (for further performance studies such as "low N" or
"mutational extrapolation" engineering tasks):
        pypef encode --input CSV_FILE --encoding ENCODING_TECHNIQUE --wt WT_SEQ
            [--params PLMC_FILE] [--y_wt WT_FITNESS] [--model MODEL12345] [--nofft]
            [--threads THREADS] [--sep CSV_COLUMN_SEPARATOR] [--fitness_key FITNESS_KEY]

Converting a STO alignment file to A2M format:
        pypef sto2a2m --sto STO_MSA_FILE
            [--inter_gap INTER_GAP] [--intra_gap INTRA_GAP]


Usage:
    pypef mklsts --wt WT_SEQ --input CSV_FILE
        [--drop THRESHOLD] [--numrnd NUMBER]
    pypef mkps --wt WT_SEQ --input CSV_FILE
        [--drop THRESHOLD] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
        [--ddiverse] [--tdiverse] [--qdiverse]
    pypef encode --input CSV_FILE --encoding ENCODING_TECHNIQUE --wt WT_SEQ
        [--params PLMC_FILE] [--y_wt WT_FITNESS] [--model MODEL12345] [--nofft]
        [--threads THREADS]
        [--sep CSV_COLUMN_SEPARATOR] [--fitness_key FITNESS_KEY]
    pypef shift_pos --input CSV_FILE --offset OFFSET
        [--sep CSV_COLUMN_SEPARATOR] [--mutation_sep MUTATION_SEPARATOR] [--fitness_key FITNESS_KEY]
    pypef sto2a2m --sto STO_MSA_FILE [--inter_gap INTER_GAP] [--intra_gap INTRA_GAP]
    pypef hybrid --ls LEARNING_SET --ts TEST_SET --params PLMC_FILE
        [--label] [--threads THREADS]
    pypef hybrid --model MODEL --params PLMC_FILE
        [--ts TEST_SET]
        [--figure TS_FOR_PLOTTING] [--label]
        [--ps PREDICTION_SET] [--pmult] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
                                        [--ddiverse] [--tdiverse] [--qdiverse] [--negative]
        [--threads THREADS]
    pypef hybrid directevo --wt WT_SEQ --model MODEL12345 --params PLMC_FILE
        [--input CSV_FILE] [--y_wt WT_FITNESS] [--numiter NUM_ITER]
        [--numtraj NUM_TRAJ] [--temp TEMPERATURE]
        [--negative] [--usecsv] [--csvaa] [--drop THRESHOLD]
    pypef hybrid train_and_save --input CSV_FILE --params PLMC_FILE
        [--fit_size REL_LEARN_FIT_SIZE] [--test_size REL_TEST_SIZE]
        [--threads THREADS] [--sep CSV_COLUMN_SEPARATOR]
        [--fitness_key FITNESS_KEY] [--rnd_state RND_STATE]
    pypef hybrid low_n --input ENCODED_CSV_FILE
    pypef hybrid extrapolation --input ENCODED_CSV_FILE
        [--conc]
    pypef ml --encoding ENCODING_TECHNIQUE --ls LEARNING_SET --ts TEST_SET
        [--save NUMBER] [--regressor TYPE] [--nofft] [--all] [--params PLMC_FILE]
        [--sort METRIC_INT] [--threads THREADS] [--color]
    pypef ml --show
        [MODELS]
    pypef ml --encoding ENCODING_TECHNIQUE --model MODEL12345 --figure TS_FOR_PLOTTING
        [--label] [--color] [--y_wt WT_FITNESS] [--nofft] [--params PLMC_FILE] [--threads THREADS]
    pypef ml --encoding ENCODING_TECHNIQUE --model MODEL12345 --ps PREDICTION_SET
        [--params PLMC_FILE] [--threads THREADS] [--nofft] [--negative]
    pypef ml --encoding ENCODING_TECHNIQUE --model MODEL12345 --pmult
        [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
        [--ddiverse] [--tdiverse] [--qdiverse]
        [--regressor TYPE] [--nofft] [--negative] [--params PLMC_FILE] [--threads THREADS]
    pypef ml --encoding ENCODING_TECHNIQUE directevo --model MODEL12345 --wt WT_SEQ
        [--input CSV_FILE] [--y_wt WT_FITNESS] [--numiter NUM_ITER] [--numtraj NUM_TRAJ] [--temp TEMPERATURE]
        [--nofft] [--negative] [--usecsv] [--csvaa] [--drop THRESHOLD] [--params PLMC_FILE]
    pypef ml low_n --input ENCODED_CSV_FILE
        [--regressor TYPE]
    pypef ml extrapolation --input ENCODED_CSV_FILE
        [--regressor TYPE] [--conc]


Options:
  --all                             Finally training on all data [default: False].
  --color                           Color the plot for "true" and "false" predictions
                                    quarters [default: False].
  --conc                            Concatenating mutational level variants for predicting variants
                                    from next higher level [default: False].
  --csvaa                           Directed evolution csv amino acid substitutions,
                                    requires flag "--usecsv" [default: False].
  --ddiverse                        Create/predict double natural diverse variants [default: False].
  --drecomb                         Create/predict double recombinants [default: False].
  -d --drop THRESHOLD               Below threshold variants will be discarded from the
                                    data [default: -9E09].
  -e --encoding ENCODING_TECHNIQUE  Sets technique used for encoding sequences for constructing regression models;
                                    choose between 'aaidx' (AAIndex-based encoding), 'onehot' (OneHot-based encoding),
                                    and 'dca' (DCA-based encoding) [default: aaidx].
  -f --figure TS_FOR_PLOTTING       Validation set for plotting using a trained model.
  --fitness_key FITNESS_KEY         Label of CSV fitness column. Else uses second column.
  -h --help                         Show this screen [default: False].
  -i --input CSV_FILE               Input data file in .csv format.
  --inter_gap INTER_GAP             Fraction to delete all positions with more than
                                    'inter_gap' * 100 % gaps (columnar trimming) [default: 0.3].
  --intra_gap INTRA_GAP             Fraction to delete all sequences with more than
                                    'intra_gap' * 100 % gaps after being columnar trimmed
                                    (line trimming) [default: 0.5].
  --label                           Label the plot instances [default: False].
  -l --ls LEARNING_SET              Input learning set in .fasta format.
  -m --model MODEL12345             Model (pickle file) for plotting of validation or for
                                    performing predictions.
  --mutation_sep MUTATION_SEP       Mutation separator [default: /].
  --mutation_extrapolation          Mutation extrapolation [default: False].
  --negative                        Set if more negative values define better variants [default: False].
  --nofft                           Raw sequence input, i.e., no FFT for establishing protein spectra
                                    as vector inputs, only implemented as option for AAindex-based
                                    sequence encoding [default: False].
  -n --numrnd NUMBER                Number of randomly created Learning and Validation
                                    datasets [default: 0].
  --numiter NUM_ITER                Number of mutation iterations per evolution trajectory [default: 5].
  --numtraj NUM_TRAJ                Number of trajectories, i.e., evolution pathways [default: 5].
  -o --offset OFFSET                Offset for shifting substitution positions of the input CSV file [default: 0].
  --params PLMC_FILE                Input PLMC couplings parameter file.
  -u --pmult                        Predict for all prediction files in folder for recombinants
                                    or for diverse variants [default: False].
  -p --ps PREDICTION_SET            Prediction set for performing predictions using a trained Model.
  --qdiverse                        Create quadruple natural diverse variants [default: False].
  --qarecomb                        Create/predict quadruple recombinants [default: False].
  --qirecomb                        Create/predict quintuple recombinants [default: False].
  --regressor TYPE                  Type of regression (R.) to use, options: PLS CV R.: pls,
                                    PLS LOOCV R.: pls_loocv, Random Forest CV R.: rf, SVM CV R.: svr,
                                    MLP CV R.: mlp, Ridge CV R.: ridge (or l2),
                                    LassoLars CV R.: lassolars (or l1) [default: pls].
  --rnd_splits RND_SPLITS           Number of random splits for Low N testing [default: 5].
  --rnd_state RND_STATE             Sets the random state for reproduction, only implemented
                                    for hybrid train_and_save [default: 42].
  -s --save NUMBER                  Number of models to be saved as pickle files [default: 5].
  --sep CSV_COLUMN_SEPARATOR        CSV Column separator [default: ;].
  --show                            Show achieved model performances from Model_Results.txt.
  --sort METRIC_INT                 Rank models based on metric {1: R^2, 2: RMSE, 3: NRMSE,
                                    4: Pearson's r, 5: Spearman's rho} [default: 1].
  --sto STO_MSA_FILE                The input MSA file in STO (Stockholm) format.
  --tdiverse                        Create/predict triple natural diverse variants [default: False].
  --temp TEMPERATURE                "Temperature" of Metropolis-Hastings criterion [default: 0.01]
  --threads THREADS                  Parallel computing of training and validation of models.
                                    Number of threads used in parallel computing, by default
                                    no hyperthreading.
  --fit_size REL_LEARN_FIT_SIZE     Relative size of the train set for initial fitting. The remaining data
                                    for training is used for hyperparameter optimization on train subsets
                                    used for validation, while in sum the total data for training is
                                    training data = train_fit data + train_test(validation) data
                                                  = all data - test data.
                                    The default of 0.66 means that 34 % of the train data is taken for
                                    train_test validation [default: 0.66].
  --test_size REL_TEST_SIZE         Relative size of the test set; if set to 0.0 the trained model
                                    will not be tested [default: 0.2].
  --trecomb                         Create/predict triple recombinants [default: False].
  --usecsv                          Perform directed evolution on single variant csv position
                                    data [default: False].
  -t --ts TEST_SET                  Input validation set in .fasta format.
  --version                         Show version [default: False].
  -w --wt WT_SEQ                    Input file (in FASTA format) for wild-type sequence.
  --wt_pos WT_POSITION              Row position of encoded wild-type in encoding CSV file (0-indexed) [default: 0].
  -y --y_wt WT_FITNESS              Fitness value (y) of wild-type [default: 1.0].
  encode                            Encoding [default: False].
  hybrid                            Hybrid modeling based on DCA-derived sequence encoding [default: False].
  ml                                Pure machine learning modeling based on encoded sequences [default: False].
  MODELS                            Number of saved models to show [default: 5].
  onehot                            OneHot-based encoding [default: False].
  shift_pos                         Shift positions of all variant substitutions of the input CSV
                                    file [default: False].
  sto2a2m                           Transform multiple sequence alignment from STO format to
                                    A2M format [default: False].
"""


from docopt import docopt
from schema import Schema, SchemaError, Optional, Or, Use

from pypef import VERSION
from pypef.ml.run import run_pypef_pure_ml
from pypef.dca.run import run_pypef_hybrid_modeling
from pypef.utils.run import run_pypef_utils


def _build_schema():
    """
    Assemble the schema used to validate the argument dictionary from docopt.

    Every key is optional. Keys are grouped by the validator that is applied
    to their value; each key receives its own freshly created validator.
    """
    validator_groups = (
        # Flags and (sub)command names: the value has to be a bool
        (lambda: bool, (
            '--all', '--color', '--conc', '--csvaa', '--ddiverse', '--drecomb',
            '--help', '--label', '--negative', '--nofft', '--pmult',
            '--qdiverse', '--qarecomb', '--qirecomb', '--tdiverse',
            '--trecomb', '--usecsv',
            'aaidx', 'hybrid', 'directevo', 'encode', 'extrapolation',
            'low_n', 'mklsts', 'mkps', 'ml', 'onehot', 'shift_pos',
            'sto2a2m', 'train_and_save',
        )),
        # Values that are converted to float
        (lambda: Use(float), (
            '--drop', '--fit_size', '--inter_gap', '--intra_gap', '--temp',
            '--test_size', '--train_size',
        )),
        # Values that are converted to str
        (lambda: Use(str), (
            '--encoding',
        )),
        # Values that are either not given (None) or a str
        (lambda: Or(None, str), (
            '--figure', '--fitness_key', '--input', '--ls', '--model',
            '--mutation_sep', '--params', '--ps', '--regressor', '--sep',
            '--sto', '--ts', '--wt',
        )),
        # Values that are converted to int
        (lambda: Use(int), (
            '--numrnd', '--numiter', '--numtraj', '--offset', '--rnd_splits',
            '--rnd_state', '--save', '--show', '--sort', '--wt_pos',
        )),
        # Values that are either not given (None) or converted to int
        (lambda: Or(None, Use(int)), (
            '--threads', 'MODELS',
        )),
        # Values that are either not given (None) or converted to float
        (lambda: Or(None, Use(float)), (
            '--y_wt',
        )),
    )
    return Schema({
        Optional(key): make_validator()
        for make_validator, keys in validator_groups
        for key in keys
    })


schema = _build_schema()


def validate(args):
    """
    Validate the parsed command-line arguments against the module-level schema.

    :param args: Argument dictionary (as returned by docopt in run_main).
    :return: The dictionary returned by schema.validate(args), i.e., the
        arguments after the validators/converters of the schema were applied.
    :raises SystemExit: If the arguments do not satisfy the schema. The caught
        SchemaError is passed on as exit payload.
    """
    try:
        return schema.validate(args)
    except SchemaError as e:
        # The builtin exit() is only injected by the site module for
        # interactive sessions and is missing e.g. when running with
        # "python -S" or from a frozen application. Raising SystemExit
        # directly terminates the program the same way without relying on it.
        raise SystemExit(e)


def run_main():
    """
    Entry point of the command-line interface.

    Parses the command line with docopt based on the module docstring,
    validates the resulting dictionary and hands it to one of the run
    functions: 'directevo' is checked first and goes to run_pypef_utils,
    then 'ml' (run_pypef_pure_ml), then 'hybrid' (run_pypef_hybrid_modeling);
    everything else is handled by run_pypef_utils as well.
    """
    parsed_args = validate(docopt(__doc__, version=VERSION))
    if parsed_args['directevo']:
        runner = run_pypef_utils
    elif parsed_args['ml']:
        runner = run_pypef_pure_ml
    elif parsed_args['hybrid']:
        runner = run_pypef_hybrid_modeling
    else:
        runner = run_pypef_utils
    runner(parsed_args)


# Start the command-line interface only when this file is executed as a script
if __name__ == '__main__':
    run_main()
