#!/usr/bin/env python

# Copyright (C) 2016 Ian W. Harry
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

"""
Workflow generator for the 'uberbank' template bank construction. The script
as written runs a configurable example of pycbc_geom_aligned_bank and, unless
instructed otherwise, another independent one of sbank. Typically, a standard
uberbank uses this round of sbank to cover BBH sources, while a PyGRB uberbank
does not require performing this step: this step can be avoided with the
option skip-coarse-bank = in the section [workflow-bank_structure]. The script
then combines these two banks together, if the BBH sbank was indeed generated.
Finally, it runs a second example of sbank (configuration file tag:
[sbank-final]) on the resulting output. It is foreseen that this script would
be altered to generate workflows in slightly different configurations to the
one described, so hopefully the script is clear enough to allow someone to do
that.
"""

#imports
from __future__ import division
import os
import argparse
import logging
from Pegasus import DAX3 as dax
import pycbc
import pycbc.version
import pycbc.workflow as wf

# Module metadata: author and program name, plus the version string and date
# read from pycbc.version.
__author__ = "Ian Harry <ian.harry@ligo.org>"
__version__ = pycbc.version.git_verbose_msg
__date__ = pycbc.version.date
__program__ = "pycbc_make_uberbank_workflow"

# Define classes for executables

class GeomBankExecutable(wf.Executable):
    """ Executable wrapper used to run pycbc_geom_aligned_bank.
    """
    # This job outputs a dax file.
    current_retention_level = wf.Executable.FINAL_RESULT

    # Options whose values are input files
    file_input_options = [
        '--psd-file',
        '--asd-file',
        '--fake-strain-from-file',
        '--injection-file',
        '--sgburst-injection-file',
        '--gating-file',
    ]

    def create_node(self, analysis_time, geom_out_file, out_storage_path,
                    workflow_name='geom_bank'):
        """ Create a geometric bank dax generator node.

        Parameters
        ----------
        analysis_time :
            Forwarded unchanged to node.new_output_file_opt for every file
            this job declares as an output.
        geom_out_file :
            Object whose ``storage_path`` attribute is passed as the value of
            --output-file.
        out_storage_path :
            Value passed to --storage-path-base.
        workflow_name : str, optional
            Accepted for interface compatibility; it is not used anywhere in
            this method.

        Returns
        -------
        node :
            The node created by wf.Executable.create_node, with all options
            below added. Its outputs are declared in the order: intermediate
            data (.hdf), metadata (.xml), dax (.dax), map (.map) and
            transformation catalog (.tc.txt).
        """
        node = wf.Executable.create_node(self)

        # Output file: only the storage path is passed as a plain option,
        # because this job doesn't actually produce this file, the resulting
        # dax will do that.
        node.add_opt('--output-file', geom_out_file.storage_path)
        node.add_opt('--storage-path-base', out_storage_path)
        node.add_opt('--is-sub-workflow', '')

        # Intermediate outputs and dax files, as
        # (file extension, command-line option, extra tag). The order here
        # fixes the order in which the outputs are declared on the node.
        declared_outputs = (
            ('.hdf', '--intermediate-data-file', 'INTERMEDIATE'),
            ('.xml', '--metadata-file', 'METADATA'),
            ('.dax', '--dax-filename', 'DAX'),
            ('.map', '--map-filename', 'MAP'),
            ('.tc.txt', '--transformation-catalog-filename',
             'TRANSFORMATION_CATALOG'),
        )
        for extension, option, extra_tag in declared_outputs:
            node.new_output_file_opt(analysis_time, extension, option,
                                     tags=self.tags + [extra_tag])
        return node

class SbankDaxGenerator(wf.Executable):
    """ Class for running pycbc_make_sbank_workflow.
    """
    # This outputs a dax file.
    current_retention_level = wf.Executable.FINAL_RESULT

    # This tells us about input file options
    file_input_options = []

    def create_node(self, analysis_time, config_file, out_storage_path,
                    output_file, seed_files=None,
                    workflow_name='sbank_workflow', sub_wf_tags=None):
        """ Create a sbank dax generator node.

        Parameters
        ----------
        analysis_time :
            Forwarded unchanged to node.new_output_file_opt for every file
            this job declares as an output.
        config_file :
            File added to the node as the --config-files input.
        out_storage_path :
            Value passed to --output-dir.
        output_file :
            Object whose ``storage_path`` attribute is passed as the value of
            --output-file.
        seed_files : iterable of str, optional
            Seed bank paths. Only the basename of each entry is used; they
            are joined with spaces and passed through --config-overrides as
            ``workflow:seed-bank:"..."``. If None, no override is added.
        workflow_name : str, optional
            Value passed to --workflow-name.
        sub_wf_tags : list of str, optional
            Tags passed, space-separated, to --tags and inserted into the
            tags of the declared output files. If None, --tags is not added
            and only ``self.tags`` is used for the output files.

        Returns
        -------
        node :
            The node created by wf.Executable.create_node, with all options
            below added. Its outputs are declared in the order: dax (.dax),
            map (.map) and transformation catalog (.tc.txt).
        """
        node = wf.Executable.create_node(self)

        # Output file: This one is done weirdly because this job doesn't
        # actually produce this file, the resulting dax will do that.
        node.add_opt('--output-file', output_file.storage_path)

        node.add_opt('--workflow-name', workflow_name)
        node.add_opt('--output-dir', out_storage_path)
        node.add_opt('--is-sub-workflow', '')

        if sub_wf_tags is not None:
            node.add_opt('--tags', ' '.join(sub_wf_tags))

        node.add_input_opt('--config-files', config_file)

        config_overrides = []

        if seed_files is not None:
            seed_names = [os.path.basename(seed_file)
                          for seed_file in seed_files]
            config_overrides.append('workflow:seed-bank:"%s"'
                                    % ' '.join(seed_names))

        if config_overrides:
            node.add_opt('--config-overrides', ' '.join(config_overrides))

        # sub_wf_tags defaults to None, which cannot be concatenated to a
        # list, so fall back to an empty list when building the file tags.
        extra_tags = [] if sub_wf_tags is None else list(sub_wf_tags)
        base_tags = self.tags + extra_tags

        node.new_output_file_opt(analysis_time, '.dax',
                                 '--dax-filename',
                                 tags=base_tags + ['DAX'])
        node.new_output_file_opt(analysis_time, '.map',
                                 '--map-filename',
                                 tags=base_tags + ['MAP'])
        node.new_output_file_opt(analysis_time, '.tc.txt',
                                 '--transformation-catalog-filename',
                                 tags=base_tags + ['TRANSFORMATION_CATALOG'])

        return node

##############################################################################
# Argument parsing and setup of workflow                                     #
##############################################################################


# Command-line parsing relies on the standard workflow routines. Things like
# the configuration file are specified within the "workflow command line
# group", so run this script with --help to see which options are added.

# Drop the first character of the module docstring (its leading newline).
_desc = __doc__[1:]
parser = argparse.ArgumentParser(description=_desc)
parser.add_argument('--version', action='version', version=__version__)
parser.add_argument(
    "--workflow-name",
    type=str,
    default='sbank_workflow',
    help="Descriptive name of the analysis."
)
parser.add_argument(
    "-d", "--output-dir",
    default='./',
    help="Path to output directory."
)
wf.add_workflow_command_line_group(parser)
args = parser.parse_args()

# Create the workflow object
workflow = wf.Workflow(args, args.workflow_name)

# Create the output directory and move into it, so the relative paths used
# from here on (starting with 'daxes') are resolved inside it.
wf.makedir(args.output_dir)
os.chdir(args.output_dir)
wf.makedir('daxes')

##############################################
# Run the geom_bank workflow the first time  #
##############################################
logging.info("Setting up the round of geometric bank jobs.")
geom_exe = GeomBankExecutable(workflow.cp, 'geom_aligned_bank',
                              ifos=workflow.ifos, out_dir='daxes')

# This one's a bit weird as the dag generator names the final output file,
# but that job doesn't *generate* it, so we create this file first.
# However, this file must *not* appear in the output mapper, as then the
# sub-workflow will think that it already exists. So for now we are setting
# store_file=False and relying on the sub-workflow to specify the PFN for this
# file. Hopefully this will work if we need to add this file as a dependency
# for a later sub-workflow.
# NOTE(review): the comment above mentions store_file=False, but the call
# below passes store_file=True - confirm which one is intended.
geom_out_dir = os.path.abspath('initial_geom_bank')
geom_out_file = wf.File(
    geom_exe.ifo_list,
    geom_exe.name,
    workflow.analysis_time,
    extension='.h5',
    store_file=True,
    directory=geom_out_dir,
    tags=[],
    use_tmp_subdirs=False
)
geom_node = geom_exe.create_node(
    workflow.analysis_time,
    geom_out_file,
    geom_out_dir,
    workflow_name='initial_geom_bank'
)
workflow += geom_node

# Pick the node outputs by position and verify each one is the expected file;
# they all have unique extensions.
intermediate_file = geom_node.output_files[0]
assert intermediate_file.name.endswith('hdf')
metadata_file = geom_node.output_files[1]
assert metadata_file.name.endswith('xml')
dax_file = geom_node.output_files[2]
assert dax_file.name.endswith('.dax')
map_file = geom_node.output_files[3]
assert map_file.name.endswith('.map')
tc_file = geom_node.output_files[4]
assert tc_file.name.endswith('.tc.txt')

# NOTE: One can use workflow += dax_job, but this seems to miss a bunch of
#       the stuff done below, and does not consider file management
dax_job = dax.DAX(dax_file)
# The basename is the dax file name with its directory and final extension
# removed.
geom_dax_basename = os.path.splitext(os.path.basename(dax_file.name))[0]
dax_job.addArguments('--basename %s' % geom_dax_basename)
wf.Workflow.set_job_properties(dax_job, map_file, tc_file)

# Tell the dax job that it needs some input files
for required_input in (metadata_file, intermediate_file):
    dax_job.uses(required_input, link=dax.Link.INPUT, register=False,
                 transfer=True)

# And this output is used again so declare it. See notes at bottom!!!
#dax_job.uses(geom_out_file, link=dax.Link.OUTPUT, register=False,
#             transfer=False)

# The sub-dax job is added to the workflow as a child of the job that
# generates the dax.
workflow._adag.addJob(dax_job)
dep = dax.Dependency(parent=geom_node._dax_node, child=dax_job)
workflow._adag.addDependency(dep)
geom_dax_job = dax_job

###############################################
# Run the sbank workflow for the first time   #
###############################################

sbank_exe = SbankDaxGenerator(workflow.cp, 'sbank_workflow',
                              ifos=workflow.ifos, out_dir='daxes')

# Write the workflow configuration to disk so it can be given to the sbank
# dax generators as their config file. The file is written inside a context
# manager so that it is flushed and closed before being wrapped as a File.
config_path = os.path.abspath(os.path.join('daxes', 'sbank_workflow.ini'))
with open(config_path, 'w') as config_fp:
    workflow.cp.write(config_fp)
config_file = wf.File.from_path(config_path)

# This first round of sbank is used for BBHs for the standard uberbank;
# a PyGRB uberbank covers only potentially EM-bright systems and therefore
# skips this step.
if not workflow.cp.has_option("workflow-bank_structure", "skip-coarse-bank"):
    logging.info("Setting up the (optional) first round of sbank jobs.")
    sbank_out_file_bbh = wf.File(sbank_exe.ifo_list, sbank_exe.name,
                                 workflow.analysis_time, extension='.h5',
                                 store_file=True, directory=sbank_exe.out_dir,
                                 tags=['sbank_bbh'], use_tmp_subdirs=False)
    sbank_node = sbank_exe.create_node(workflow.analysis_time, config_file,
                                       os.path.abspath('sbank_bbh'),
                                       sbank_out_file_bbh,
                                       workflow_name='sbank_bbh',
                                       sub_wf_tags=['bbh'])

    workflow += sbank_node

    # The generator node declares its outputs in the order dax, map,
    # transformation catalog.
    dax_file = sbank_node.output_files[0]
    map_file = sbank_node.output_files[1]
    tc_file = sbank_node.output_files[2]
    dax_job = dax.DAX(dax_file)
    dax_job.addArguments('--basename %s' % \
                         os.path.splitext(os.path.basename(dax_file.name))[0])
    wf.Workflow.set_job_properties(dax_job, map_file, tc_file)

    # The sub-dax job is added as a child of the job that generates the dax.
    workflow._adag.addJob(dax_job)
    dep = dax.Dependency(parent=sbank_node._dax_node, child=dax_job)
    workflow._adag.addDependency(dep)
    sbank_bbh_dax_job = dax_job

###############################################
# Run the sbank workflow for the second time  #
###############################################

logging.info("Setting up the final round of sbank jobs.")
sbank_out_file_final = wf.File(
    sbank_exe.ifo_list,
    sbank_exe.name,
    workflow.analysis_time,
    extension='.h5',
    store_file=True,
    directory='.',
    tags=['sbank_final'],
    use_tmp_subdirs=False
)

# The coarse (BBH) sbank round is only set up when skip-coarse-bank is absent
# from [workflow-bank_structure]; evaluate that once for this section.
have_coarse_bank = not workflow.cp.has_option("workflow-bank_structure",
                                              "skip-coarse-bank")

# The final sbank round is seeded with the geometric bank and, if it was
# generated, the coarse sbank output.
seed_files = [geom_out_file.name]
if have_coarse_bank:
    seed_files.append(sbank_out_file_bbh.name)

sbank_node = sbank_exe.create_node(
    workflow.analysis_time,
    config_file,
    os.path.abspath('sbank_final'),
    sbank_out_file_final,
    seed_files=seed_files,
    workflow_name='sbank_final',
    sub_wf_tags=['final']
)

workflow += sbank_node

# The generator node declares its outputs in the order dax, map,
# transformation catalog.
dax_file, map_file, tc_file = [sbank_node.output_files[idx]
                               for idx in (0, 1, 2)]
dax_job = dax.DAX(dax_file)
final_dax_basename = os.path.splitext(os.path.basename(dax_file.name))[0]
dax_job.addArguments('--basename %s' % final_dax_basename)
wf.Workflow.set_job_properties(dax_job, map_file, tc_file)

workflow._adag.addJob(dax_job)

# The final sub-dax must run after the job that generates its dax and after
# every sub-dax producing one of its seed banks.
parent_jobs = [sbank_node._dax_node]
if have_coarse_bank:
    parent_jobs.append(sbank_bbh_dax_job)
parent_jobs.append(geom_dax_job)
for parent_job in parent_jobs:
    dep = dax.Dependency(parent=parent_job, child=dax_job)
    workflow._adag.addDependency(dep)


workflow.save()

# NOTES: Here are things that I found during writing this that will be useful
#        if changing this around in the future.

# 1: If we have two sub-daxes with a file dependency (i.e. dax B uses a file
#    produced by dax A) then, even if these are not *direct* parent-child
#    related, one must add an explicit dax dependency between the two
#    sub-daxes:
#    dep = dax.Dependency(parent=parent_dax_job, child=child_dax_job)

# 2: If a sub-dax uses as input a file from the parent workflow, one can use
#    a dax.uses() command to list that file as an input. This is probably
#    unnecessary but it does avoid potential issues with cleanup jobs. The
#    input mapper should be listed as an input! Should the dax file itself
#    also be listed? Currently it is not.

# 3: The case of a sub-dax producing an output file used by a job (not another
#    sub-dax) in the uberworkflow appears unsupported. If I do not list the
#    file as an output of the dax job - with dax.uses - then the uberworkflow
#    will not plan as it does not understand where that file comes from.
#    However, if the file *is* listed as an output it will be sent in the
#    input map catalog to the sub-dax (which contains the local locations of
#    all files in the uber-workflow). Then the sub-workflow thinks that this
#    file already exists and does not run the job that generates it.
