# cd_hit_clstr_file = '/lustre/home/xuyuxing/Database/Cuscuta/Cau/genomev1.1/centromere/Cau_patten.fasta.cd-hit.clstr'
from config import cd_hit_dir_path
from lib.common.os import mkdir, rmdir, cmd_run
import uuid
import os
import sys
import re
import string
from lib.common.genome.seq_base import read_fasta_big
from lib.xuyuxing.base import base_function as bf


def psi_cd_hit(input_file, output_prefix, wd, c=0.8):
    """
    Run psi-cd-hit.pl in blastn mode on a FASTA file.

    The command line is printed before it is executed through ``cmd_run``.

    :param input_file: input FASTA file, passed to ``-i``
    :param output_prefix: output name, passed to ``-o``
    :param wd: directory used as the working directory of the command
    :param c: value passed to ``-c`` (clustering threshold); anything
              accepted by ``float()`` is allowed
    :return: None
    """
    psi_cd_hit_path = cd_hit_dir_path + "/psi-cd-hit/psi-cd-hit.pl"
    # Pass the threshold with its full precision: the former '%.1f' silently
    # rounded e.g. 0.85 -> 0.8 and 0.95 -> 0.9. str(float(c)) still renders
    # one-decimal values exactly as before ('0.8', '1.0').
    threshold = str(float(c))
    cmd_string = '%s -i %s -o %s -c %s -G 1 -g 1 -prog blastn -exec local -para 8 -blp 4' % (
        psi_cd_hit_path, input_file, output_prefix, threshold)
    print(cmd_string)
    cmd_run(cmd_string, cwd=wd, silence=True)


def read_cd_hit_cluster(cd_hit_clstr_file):
    """
    Parse a cd-hit ``.clstr`` file.

    Each record starts with a ``>Cluster N`` header line followed by one
    member line per sequence, e.g. ``0<TAB>120aa, >seq_name... *``.
    Both protein (``aa``) and nucleotide (``nt``) length suffixes are
    accepted.

    :param cd_hit_clstr_file: path of the ``.clstr`` file
    :return: dict mapping the cluster header (without ``>``) to the list of
             member sequence names, in file order
    :raises ValueError: if a member line does not match the expected layout
    """
    # groups: member index, length, sequence name, trailing detail
    member_pattern = re.compile(r'^(\d+)\t(\d+)(?:aa|nt), >(\S+)\.\.\.(.*)$')

    # Read everything first so the file handle is released before parsing.
    with open(cd_hit_clstr_file, 'r') as f:
        all_text = f.read().replace('\r\n', '\n')

    cluster_dir = {}
    # Splitting on '\n>' leaves the leading '>' on the very first header only;
    # it is stripped together with any other '>' in the header below.
    for record in all_text.split('\n>'):
        if not record:
            continue
        header, _, body = record.partition('\n')
        cluster_name = header.replace('>', '')
        members = cluster_dir[cluster_name] = []
        for member_line in body.split('\n'):
            if not member_line:
                continue
            matched = member_pattern.search(member_line)
            if matched is None:
                raise ValueError(
                    "unrecognized line in cd-hit cluster file %s (cluster %r): %r"
                    % (cd_hit_clstr_file, cluster_name, member_line))
            members.append(matched.group(3))
    return cluster_dir


def cd_hit_runing(seq_dict, args_string='-c 0.9 -n 5 -M 800 -d 0 -T 1', tmp_dir='/tmp'):
    """
    Cluster sequences with cd-hit and return the clusters keyed by their
    representative sequence.

    The sequences are written under temporary ids (``S0``, ``S1``, ...,
    zero-padded) to a scratch directory created below ``tmp_dir``; cd-hit is
    run there and the scratch directory is removed again before returning,
    also when a step fails.

    Example input::

        seq_dict = {
            "T13333N0C0001G00020_0": "RYHHPSRRCPAAVAAAASHRRRPAAAASRRHRPAVAASSRRRCQPPLAAAAPASRSCCRCPCRR",
            "T13333N0C0001G00117_0": "FDEMPKKDVVLYNCVIDGHAKKGSLDRCLVLLQQMKGEGTRPNAATFVGLLALCASSG",
        }

    :param seq_dict: dict mapping sequence name to sequence string
    :param args_string: extra command-line options appended to the cd-hit call
    :param tmp_dir: parent directory for the scratch directory
    :return: dict mapping the original name of each cluster representative to
             the list of original names of all cluster members (the
             representative included), or None when cd-hit left no
             (or an empty) ``output.clstr``
    :raises RuntimeError: if a cluster contains no sequence that is present
                          in the cd-hit representative output
    """
    # rename seq
    # NOTE(review): the short temporary ids presumably protect against cd-hit
    # altering/truncating long or unusual names -- confirm.
    id_width = len(str(len(seq_dict)))
    new_id_pattern = "S%0" + str(id_width) + "d"
    renamed = [(new_id_pattern % idx, old_id) for idx, old_id in enumerate(seq_dict)]
    new_to_old = dict(renamed)

    work_dir = os.path.abspath(os.path.join(tmp_dir, uuid.uuid1().hex))
    mkdir(work_dir)
    try:
        # write tmp file
        with open(os.path.join(work_dir, "input.fasta"), 'w') as f:
            for new_id, old_id in renamed:
                f.write(">%s\n%s\n" % (new_id, seq_dict[old_id]))

        # run
        cmd_string = '%s/cd-hit -i input.fasta -o output %s' % (cd_hit_dir_path, args_string)
        cmd_run(cmd_string, cwd=work_dir, silence=True)

        # get representative seq: every record of the "output" fasta
        representative_ids = {
            record.seqname for record in read_fasta_big(os.path.join(work_dir, "output"))
        }

        # get clstr
        output_file = os.path.join(work_dir, "output.clstr")
        if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
            return None

        cluster_old_name_dict = {}
        for c_id, member_ids in read_cd_hit_cluster(output_file).items():
            # Reset per cluster so a cluster without a detected representative
            # can never reuse (and overwrite) the previous cluster's key.
            rep_new_id = None
            for new_seq_id in member_ids:
                if new_seq_id in representative_ids:
                    rep_new_id = new_seq_id
            if rep_new_id is None:
                raise RuntimeError(
                    "cd-hit cluster %r has no representative sequence in %s/output"
                    % (c_id, work_dir))
            cluster_old_name_dict[new_to_old[rep_new_id]] = [
                new_to_old[new_seq_id] for new_seq_id in member_ids
            ]
        return cluster_old_name_dict
    finally:
        # always drop the scratch directory, including on errors
        rmdir(work_dir)
    


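# TODO: port the legacy Python 2 script below to Python 3. It reads a cd-hit
# .clstr file and writes one "<output_prefix>/<cluster_id>.seq" FASTA file
# per cluster.
#
# Usage:
# python /share/home/xuyuxing/python_xuyuxing/Genome_work_tools/Extract_seq_from_cd-hit.py cd-hit.0.65.clstr 0.65 all.uniq.fa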
#
#
# cd_hit_clstr_file = sys.argv[1]
# output_prefix = sys.argv[2]
# fasta_file = sys.argv[3]
#
# F1 = open(cd_hit_clstr_file)
# all_text = F1.read()
# info = string.split(all_text, '\n>')
# while '' in info:
#     info.remove('')
# row = []
# cluster_dict = {}
# for i in info:
#     cluster_id = re.match(r'.*(Cluster \d+)\n.*', i)
#     if cluster_id:
#         cluster_id = cluster_id.group(1)
#
#     cluster_id = re.sub(r' ', '_', cluster_id)
#
#     cluster_dict[cluster_id] = []
#
#     seqs = string.split(i, '\n')
#     for seq in seqs:
#         seq_id = re.match(r'.*>(.*)\.\.\..*', seq)
#         if seq_id:
#             seq_id = seq_id.group(1)
#             cluster_dict[cluster_id].append(seq_id)
#
# bf.mkdir(output_prefix)
#
# fasta_dict, a = seq_base.read_fasta(fasta_file)
#
# for i in cluster_dict:
#     file_path = output_prefix + "/" + i + ".seq"
#     seq_base.extract_seq_to_fasta(file_path, cluster_dict[i], fasta_dict)
