#!/bin/bash

# Runs over the files generated by Proper and creates CSV files (the delimiter
# is ";") for US and German spreadsheet applications. Additionally, it creates
# LaTeX tables from the data.
# 
# $Revision: 1.2 $
# FracPete

#############
# Functions #
#############

# Prints the command line help to stdout.
# Reads the globals DIR, OUTDIR and PREFIX to display the current defaults.
function usage()
{
   # one argument per output line; empty arguments produce the blank lines
   printf '%s\n' \
      "" \
      "usage: ${0##*/} -i <input-dir> -o <output-dir> -n <filename> [-h]" \
      "       [-p] [-e] [-r] [-t]" \
      "" \
      "creates CSV-files and LaTeX-Tables (german/US-american) from the generated" \
      "ARFF stat files" \
      "" \
      " -h   this help" \
      " -i   <input-dir>" \
      "      the directory where the ARFF files are located" \
      "      default: $DIR" \
      " -o   <output-dir>" \
      "      the directory where to store the CSV/LaTeX files" \
      "      default: $OUTDIR" \
      " -n   <filename>" \
      "      the filename of the output (CSV/LaTeX)" \
      "      default: $PREFIX" \
      " -p   include parameters in file" \
      " -e   check only for exceptions" \
      " -r   check only for runtime" \
      " -t   check only for treesize" \
      ""
}

# Splits the ARFF file "TMPFILE" into its header and its data part.
# Reads:  TMPFILE, NOBLANK_FILE, ATTR_FILE, DATA_FILE, PREFIX
# Writes: NOBLANK_FILE  TMPFILE without carriage returns and empty lines
#         ATTR_FILE     the "@attribute" lines found before "@data"
#         DATA_FILE     the lines following "@data"
#         ${PREFIX}00 and ${PREFIX}01, the raw csplit pieces (see clean_up)
function create_files()
{
   # Remove the carriage returns of DOS line endings first; otherwise an
   # "empty" line still contains a CR and survives the blank line filter.
   # (The former unquoted "sed s/{\r}//" only removed the literal text "{r}".)
   tr -d '\r' < "$TMPFILE" | grep -v '^$' > "$NOBLANK_FILE"

   # piece 00: everything before "@data", piece 01: "@data" and what follows
   csplit -s -f "$PREFIX" "$NOBLANK_FILE" '/@data/'
   grep '^@attribute' "${PREFIX}00" > "$ATTR_FILE"
   grep -v '^@data' "${PREFIX}01" > "$DATA_FILE"
}

# Looks for the pattern "TMP" (a grep regular expression) in the file
# "TMPFILE" and stores the number of every matching line in "TMP"
# (one number per line, empty if nothing matched).
function get_line_by_pattern()
{
   # "--" keeps a pattern that starts with "-" from being read as an option
   TMP=$(grep -n -- "$TMP" "$TMPFILE" | cut -d: -f1)
}

# Translates an attribute name into a column number.
# In:  TMP      pattern that identifies the attribute
# Out: TMP      result of get_line_by_pattern run on ATTR_FILE
#      TMPFILE  is left pointing at ATTR_FILE
function get_column()
{
   # the lookup is a plain line search within the attribute list
   TMPFILE="${ATTR_FILE}"
   get_line_by_pattern
}

# Extracts the comma separated column number "TMP" from DATA_FILE and stores
# it in "TMP".
# FIRST = "yes": only the value of the first row is returned
# otherwise:     the values of all rows are returned, one per line
# An empty column number yields an empty result.
function get_col_data()
{
   local column=$TMP

   if [ -z "$column" ]
   then
      TMP=""
      return 0
   fi

   # a literal "first,last" inside a field contains the field delimiter, so
   # its comma is replaced before the line is cut into columns
   if [ "$FIRST" = "yes" ]
   then
      TMP=$(head -n 1 "$DATA_FILE" | sed 's/first,last/first.last/g' | cut -d, -f"$column")
   else
      TMP=$(sed 's/first,last/first.last/g' "$DATA_FILE" | cut -d, -f"$column")
   fi
}

# Retrieves the number of class labels from the file "TMPFILE" and stores it
# in "TMP".
# Notes: the class must be the last attribute in the file!
#        quoted labels like '...' are replaced by "#" before counting, since
#        blanks or commas inside the quotes would otherwise be counted as
#        separators
function get_classcount()
{
   local last_line

   last_line=$(tail -n 1 "$TMPFILE")
   # keep only the text between "{" and "}", then count the comma separated
   # entries as words
   TMP=$(printf '%s\n' "$last_line" \
      | sed -e 's/.*{//' -e 's/}.*//' -e "s/'[^']*'/#/g" -e 's/,/ /g' \
      | wc -w | tr -d ' ')
}

# Retrieves the number of runs from the file "TMPFILE" and stores it in "TMP".
# The value is whatever follows the last "<3 characters><blank>" sequence on
# the "Determining Runs" line. If the file exists but holds no such line, 10
# is assumed; if the file does not exist, "TMP" is empty.
function get_runs()
{
   # quoted on purpose: an empty or blank-containing TMPFILE must count as
   # "file does not exist" instead of confusing the test command
   if [ -f "$TMPFILE" ]
   then
      TMP=$(grep "Determining Runs" "$TMPFILE" | sed 's/.*... //g')
      # not specified -> 10
      TMP=${TMP:-10}
   else
      TMP=""
   fi
}

# Retrieves the number of folds from the file "TMPFILE" and stores it in "TMP".
# The value is whatever follows the last "<3 characters><blank>" sequence on
# the "Determining Folds" line. If the file exists but holds no such line, 10
# is assumed; if the file does not exist, "TMP" is empty.
function get_folds()
{
   # quoted on purpose: an empty or blank-containing TMPFILE must count as
   # "file does not exist" instead of confusing the test command
   if [ -f "$TMPFILE" ]
   then
      TMP=$(grep "Determining Folds" "$TMPFILE" | sed 's/.*... //g')
      # not specified -> 10
      TMP=${TMP:-10}
   else
      TMP=""
   fi
}

# Returns the number of attributes, i.e. the number of lines of the file
# "TMPFILE", in "TMP". A file that cannot be read counts as 0 lines.
function get_attcount()
{
   TMP=$(wc -l < "$TMPFILE" | tr -d ' ')
   # the redirection fails for an unreadable file and leaves TMP empty
   TMP=${TMP:-0}
}

# Returns the number of instances/bags from the file "TMPFILE" in "TMP".
# The number is taken from the first line containing "Total Number of":
# the word following that text is dropped and all blanks are removed from the
# rest. "TMP" is empty if the file does not exist.
function get_instcount()
{
   if [ -f "$TMPFILE" ]
   then
      TMP=$(grep "Total Number of" "$TMPFILE" | head -n 1 \
         | sed -e 's/.*Total Number of [^ ]*//' -e 's/ *//g')
   else
      TMP=
   fi
}

# Returns the number of records from the file "TMPFILE" in "TMP".
# The value is the text after the last ":" of a "Record-Count" line, with all
# blanks removed. TESTER = "no" selects the first of these lines, any other
# value the last one. "TMP" is empty if the file does not exist.
function get_reccount()
{
   local pick

   if [ -f "$TMPFILE" ]
   then
      if [ "$TESTER" = "no" ]
      then
         pick="head"
      else
         pick="tail"
      fi
      TMP=$(grep "Record-Count" "$TMPFILE" | "$pick" -n 1 \
         | sed -e 's/.*://' -e 's/ *//g')
   else
      TMP=
   fi
}

# Returns the number of lines of the file "TMPFILE" in "TMP".
# A file that cannot be read counts as 0 lines.
function get_linecount()
{
   TMP=$(wc -l < "$TMPFILE" | tr -d ' ')
   # the redirection fails for an unreadable file and leaves TMP empty
   TMP=${TMP:-0}
}


# Retrieves the parameters with which a class was run: looks in the file
# "TMPFILE" for lines matching the pattern "TMP" (a grep regular expression)
# and stores in "TMP" what follows the last ": " of each such line, with all
# double quotes removed. "TMP" is empty if the file does not exist.
function get_parameters()
{
   if [ -f "$TMPFILE" ]
   then
      TMP=$(grep -- "$TMP" "$TMPFILE" | sed -e 's/.*: //' -e 's/"//g')
   else
      TMP=""
   fi
}

# Retrieves the target field of the table from the file "TMPFILE" and stores
# it in "TMP" as "<table>.<field>": the words following "-table " and
# "-field " in the "proper.app.Proper:" parameters (see get_parameters).
# "TMP" is empty if the file does not exist. "TMP2" receives the raw
# parameter string.
function get_target()
{
   local params table field

   if [ -f "$TMPFILE" ]
   then
      TMP="proper.app.Proper:";get_parameters;TMP2=$TMP
      # put everything on one line with single blanks, so that the "first
      # word after the option" extraction below works on the whole string
      params=$(printf '%s' "$TMP2" | tr -s '[:space:]' ' ' | sed -e 's/^ //' -e 's/ $//')
      table=$(printf '%s\n' "$params" | sed -e 's/.*-table //' -e 's/ .*//')
      field=$(printf '%s\n' "$params" | sed -e 's/.*-field //' -e 's/ .*//')
      TMP="$table.$field"
   else
      TMP=""
   fi
}

# Returns the mean of the values in "TMP" (separated by blanks or newlines);
# the sum is divided by DATA_COUNT. The result is returned in "TMP", rounded
# to 2 decimals and always printed with a "." as decimal point (e.g. "0.67").
# "TMP" is empty if DATA_COUNT is not a positive number.
# Note: the former bc expression with scale=2 truncated instead of rounding.
function get_mean()
{
   local decimals="2"

   # LC_ALL=C keeps the decimal point a "." regardless of the user's locale;
   # the German files are derived from this value by the callers
   TMP=$(printf '%s\n' "$TMP" | LC_ALL=C awk -v n="$DATA_COUNT" -v d="$decimals" '
      { for (i = 1; i <= NF; i++) sum += $i }
      END {
         if (n + 0 <= 0) exit
         mean  = sum / n
         scale = 10 ^ d
         sign  = (mean < 0) ? -1 : 1
         fmt   = "%." d "f\n"
         # round half away from zero (within floating point accuracy)
         printf fmt, sign * int(sign * mean * scale + 0.5) / scale
      }')
}

# Returns the standard deviation of the values in "TMP" (separated by blanks
# or newlines) with n = DATA_COUNT and returns it in "TMP" again, rounded to
# 2 decimals and printed with a "." as decimal point.
# This is the unbiased sample standard deviation:
#    sqrt((n * sum(x^2) - sum(x)^2) / (n * (n - 1)))
# (the population variant would divide by n^2 instead).
# "TMP" is empty if DATA_COUNT is smaller than 2.
# Note: the former bc expression with scale=2 truncated the variance to two
# decimals before taking the root, so small deviations came out as 0.
function get_stddev()
{
   local decimals="2"

   # LC_ALL=C keeps the decimal point a "." regardless of the user's locale
   TMP=$(printf '%s\n' "$TMP" | LC_ALL=C awk -v n="$DATA_COUNT" -v d="$decimals" '
      { for (i = 1; i <= NF; i++) { sum += $i; sumsq += $i * $i } }
      END {
         if (n + 0 <= 1) exit
         var = (n * sumsq - sum * sum) / (n * (n - 1))
         # floating point cancellation can leave a tiny negative value
         if (var < 0) var = 0
         scale = 10 ^ d
         fmt   = "%." d "f\n"
         printf fmt, int(sqrt(var) * scale + 0.5) / scale
      }')
}

# Returns in "TMP" all lines of the file "TMPFILE" that contain the word
# "exception" (in any case). "TMP" is empty if there is no such line or if
# the file does not exist.
function get_exception()
{
   if [ -f "$TMPFILE" ]
   then
      TMP=$(grep -i exception "$TMPFILE")
   else
      TMP=""
   fi
}

# Returns the runtime found in the file "TMPFILE" in "TMP": of every line
# containing "Runtime:" (in any case) the last blank separated word is taken
# and its trailing non-digits are removed. "TMP" is "-" if the file does not
# exist and empty if the file contains no runtime line.
function get_runtime()
{
   if [ -f "$TMPFILE" ]
   then
      TMP=$(grep -i Runtime: "$TMPFILE" | sed -e 's/.* //' -e 's/[^0-9]*$//')
   else
      TMP="-"
   fi
}

# Returns the tree size found in the file "TMPFILE" in "TMP".
# Every line containing "Size of the tree" contributes what follows its last
# ": ". The result is a spreadsheet formula that averages these values, e.g.
# "=(5+7)/2". "TMP" is "-" if there is no such line or no such file.
function get_treesize()
{
   local -a sizes=()

   TMP="-"
   if [ -f "$TMPFILE" ]
   then
      # split the values at blanks/newlines into an array; unlike an unquoted
      # expansion this performs no filename expansion on the values
      read -r -d '' -a sizes < <(grep "Size of the tree" "$TMPFILE" | sed 's/.*: //')
      if [ "${#sizes[@]}" -gt 0 ]
      then
         # "${sizes[*]}" joins the elements with the first character of IFS
         local IFS='+'
         TMP="=(${sizes[*]})/${#sizes[@]}"
      fi
   fi
}

# Counts the lines of the file "TMPFILE" that are neither empty nor start
# with "#" and returns the result in "TMP". A file that cannot be read
# counts as 0 lines.
function count_lines()
{
   # grep feeds wc (instead of "grep -c") so that an unreadable file still
   # produces the number 0
   TMP=$(grep -v -e '^$' -e '^#' "$TMPFILE" | wc -l | tr -d ' ')
}

# Returns line number "LINE" (1-based) of the file "TMPFILE" in "TMP".
# Empty lines and lines starting with "#" are not counted. If LINE exceeds
# the number of remaining lines, the last of them is returned.
function get_line()
{
   TMP=$(grep -v -e '^$' -e '^#' "$TMPFILE" | head -n "$LINE" | tail -n 1)
}

# Removes the temporary files: ATTR_FILE, DATA_FILE, NOBLANK_FILE and the
# pieces written by csplit ("PREFIX" followed by two digits).
function clean_up()
{
   rm -f -- "$ATTR_FILE" "$DATA_FILE" "$NOBLANK_FILE"

   # Only the csplit pieces are matched. A bare "$PREFIX*" would also delete
   # unrelated files that merely start with the prefix, and with an empty
   # PREFIX it would delete everything in the current directory.
   if [ -n "$PREFIX" ]
   then
      rm -f -- "$PREFIX"[0-9][0-9]
   fi
}

# Extracts data from the statistics ARFF files created by the experimenter
# and appends one row per experiment to the CSV files (OUTPUT, OUTPUT_DE) and
# one table to the LaTeX files (LATEX, LATEX_DE). The "_DE" files receive the
# numbers with a decimal comma.
# expects:
#    TITLE  what to print to the screen
#    SUFFIX for what file suffix to look for
#    TESTER is read by get_reccount
# reads:   DIR plus the temporary file names used by create_files
function print_experimenter()
{
   echo "$TITLE..."

   INSERTED="no"

   # CSV-Header
   HEADER="$TITLE\nexperiment;target;classifier;options;classes;attributes;records (target/result);instances/bags;runs/folds;correct;incorrect;correct (%);incorrect (%);stddev"
   echo -e "$HEADER" >> "$OUTPUT"
   echo -e "$HEADER" >> "$OUTPUT_DE"

   # clean up before we start
   clean_up

   # run over all "stat"-files
   for FILE in "$DIR"/*-stat"$SUFFIX".arff
   do
      # zero length? (this also skips the pattern itself if nothing matched)
      if [ ! -s "$FILE" ]
      then
         continue
      fi

      # already header inserted? (only written once there is data to show)
      if [ "$INSERTED" = "no" ]
      then
         INSERTED="yes"

         # LaTeX header
         HEADER="% $TITLE\n\n\\\\begin{table}[hbtp]\n\t\\\\begin{center}\n\t\t\\\\begin{tabular}{|l|r|r|r|r r|}\n\t\t\t\\hline\n\t\t\t\\\\textbf{Dataset} & \\\\textbf{Attributes} & \\\\textbf{Instances/Bags} & \\\\textbf{Classes} & \multicolumn{2}{|r|}{\\\\textbf{Accuracy in \\%}} \\\\\\ \n\t\t\t\\hline"
         echo -e "$HEADER" >> "$LATEX"
         echo -e "$HEADER" >> "$LATEX_DE"
      fi

      # progress info: "<experiment>-stat<suffix>.arff" -> "<experiment>"
      EXPERIMENT=$(printf '%s\n' "${FILE##*/}" | cut -d. -f1 | sed "s/-stat${SUFFIX}//g")
      echo " - $EXPERIMENT"

      # split files
      TMPFILE="$FILE";create_files

      # analyze data: find the columns of the statistics we are interested in
      TMPFILE="$ATTR_FILE";TMP="Key_Scheme ";get_column;CLASSIFIER_COL=$TMP
      TMPFILE="$ATTR_FILE";TMP="Key_Scheme_options ";get_column;OPTIONS_COL=$TMP
      TMPFILE="$ATTR_FILE";TMP="Number_correct";get_column;CORRECT_COL=$TMP
      TMPFILE="$ATTR_FILE";TMP="Number_incorrect";get_column;INCORRECT_COL=$TMP
      TMPFILE="$ATTR_FILE";TMP="Percent_correct";get_column;CORRECT_PERC_COL=$TMP
      TMPFILE="$ATTR_FILE";TMP="Percent_incorrect";get_column;INCORRECT_PERC_COL=$TMP

      # extract data: classifier and options are taken from the first row only
      FIRST="yes"
      TMP=$CLASSIFIER_COL;get_col_data;CLASSIFIER=$TMP
      # single quotes become double quotes and "=" becomes "-" in the options
      TMP=$OPTIONS_COL;get_col_data;OPTIONS=$(printf '%s\n' "$TMP" | sed -e "s/'/\"/g" -e 's/=/-/g')

      FIRST="no"
      TMP=$CORRECT_COL;get_col_data;CORRECT_STR=$TMP
      TMP=$INCORRECT_COL;get_col_data;INCORRECT_STR=$TMP
      TMP=$CORRECT_PERC_COL;get_col_data;CORRECT_PERC_STR=$TMP
      TMP=$INCORRECT_PERC_COL;get_col_data;INCORRECT_PERC_STR=$TMP

      # process data
      TMPFILE="$DATA_FILE";get_linecount;DATA_COUNT=$TMP
      TMP="$CORRECT_STR";get_mean;CORRECT=$TMP
      TMP="$INCORRECT_STR";get_mean;INCORRECT=$TMP
      TMP="$CORRECT_PERC_STR";get_mean;CORRECT_PERC=$TMP
      TMP="$INCORRECT_PERC_STR";get_mean;INCORRECT_PERC=$TMP
      TMP="$CORRECT_PERC_STR";get_stddev;CORRECT_STDDEV=$TMP
      # German number format: decimal comma instead of decimal point
      CORRECT_DE=${CORRECT//./,}
      INCORRECT_DE=${INCORRECT//./,}
      CORRECT_PERC_DE=${CORRECT_PERC//./,}
      INCORRECT_PERC_DE=${INCORRECT_PERC//./,}
      CORRECT_STDDEV_DE=${CORRECT_STDDEV//./,}

      # other data: from here on the temporary files describe the data set
      # itself, no longer the statistics file
      TMPFILE="$DIR/$EXPERIMENT$SUFFIX.arff";create_files
      # (this used to read "$EXPERIMENTi", an unset variable, so the target
      # was always looked up in a file that does not exist)
      TMPFILE="$DIR/$EXPERIMENT$SUFFIX.proper";get_target;TARGET=$TMP
      TMPFILE="$ATTR_FILE";get_classcount;CLASSES=$TMP
      TMPFILE="$DIR/$EXPERIMENT$SUFFIX.eval";get_runs;RUNS=$TMP
      TMPFILE="$DIR/$EXPERIMENT$SUFFIX.eval";get_folds;FOLDS=$TMP
      TMPFILE="$ATTR_FILE";get_attcount;ATTRIBUTES=$TMP
      TMPFILE="$DIR/$EXPERIMENT$SUFFIX.proper";get_reccount;RECORDS=$TMP
      TMPFILE="$DIR/$EXPERIMENT$SUFFIX.eval";get_instcount;INSTANCES=$TMP

      # csv-file
      VALUES="$EXPERIMENT;$TARGET;$CLASSIFIER;$OPTIONS;$CLASSES;$ATTRIBUTES;\"$RECORDS\";$INSTANCES;\"$RUNS|$FOLDS\""
      LINE="$VALUES;$CORRECT;$INCORRECT;$CORRECT_PERC;$INCORRECT_PERC;$CORRECT_STDDEV"
      LINE_DE="$VALUES;\"$CORRECT_DE\";\"$INCORRECT_DE\";\"$CORRECT_PERC_DE\";\"$INCORRECT_PERC_DE\";\"$CORRECT_STDDEV_DE\""
      printf '%s\n' "$LINE"    >> "$OUTPUT"
      printf '%s\n' "$LINE_DE" >> "$OUTPUT_DE"

      # LaTeX-file ("_" has to be escaped as "\_")
      VALUES="$(printf '%s\n' "$EXPERIMENT" | sed 's/_/\\_/g') & $ATTRIBUTES & $INSTANCES & $CLASSES"
      LINE="\t\t\t$VALUES & $CORRECT_PERC & $\\pm$ $CORRECT_STDDEV \\\\\\\\"
      LINE_DE="\t\t\t$VALUES & $CORRECT_PERC_DE & $\\pm$ $CORRECT_STDDEV_DE \\\\\\\\"
      echo -e "$LINE"    >> "$LATEX"
      echo -e "$LINE_DE" >> "$LATEX_DE"

      # clean up
      clean_up
   done

   # CSV: an empty line separates the sections
   FOOTER=""
   echo -e "$FOOTER" >> "$OUTPUT"
   echo -e "$FOOTER" >> "$OUTPUT_DE"

   # LaTeX: the table is only closed if it has been opened
   if [ "$INSERTED" = "yes" ]
   then
      FOOTER="\t\t\t\\hline\n\t\t\\\\end{tabular}\n\t\t\\\\newline\n\t\t\\\\caption{Results for $TITLE}\n\t\\\\end{center}\n\\\\end{table}\n\n"
      echo -e "$FOOTER" >> "$LATEX"
      echo -e "$FOOTER" >> "$LATEX_DE"
   fi
}

# Extracts data from the logfiles ("*.test") of the test runs and appends one
# row per experiment to the CSV files (OUTPUT, OUTPUT_DE) and one table to
# the LaTeX files (LATEX, LATEX_DE). The "_DE" files receive the numbers with
# a decimal comma.
# expects:
#    TITLE  what to print to the screen
#    SUFFIX for what file suffix to look for
# reads:   DIR
function print_tester()
{
   local logfile tester_args

   echo "$TITLE..."

   INSERTED="no"

   # CSV header
   HEADER="$TITLE\nexperiment;;classifier;options;;;;;;correct;incorrect;correct (%);incorrect (%);"
   echo -e "$HEADER" >> "$OUTPUT"
   echo -e "$HEADER" >> "$OUTPUT_DE"

   # run over all "stat"-files
   for FILE in "$DIR"/*-stat"$SUFFIX".arff
   do
      # "<experiment>-stat<suffix>.arff" -> "<experiment>"
      EXPERIMENT=$(printf '%s\n' "${FILE##*/}" | cut -d. -f1 | sed "s/-stat$SUFFIX//g")
      logfile="$DIR/$EXPERIMENT$SUFFIX.test"

      # does it exist?
      if [ ! -f "$logfile" ]
      then
         continue
      fi

      # already header inserted?
      if [ "$INSERTED" = "no" ]
      then
         INSERTED="yes"

         # LaTeX header: two columns, matching the rows written below.
         # The blank after the row terminator keeps "\n" a newline escape;
         # without it the output contained a literal "\\n" and no line break.
         HEADER="% $TITLE\n\n\\\\begin{table}[hbtp]\n\t\\\\begin{center}\n\t\t\\\\begin{tabular}{|l|r|}\n\t\t\t\\hline\n\t\t\t\\\\textbf{Dataset} & \\\\textbf{Accuracy in \\%} \\\\\\ \n\t\t\t\\hline"
         echo -e "$HEADER" >> "$LATEX"
         echo -e "$HEADER" >> "$LATEX_DE"
      fi

      # does it contain any data?
      if grep -q "Nothing to test" "$logfile"
      then
         continue
      fi

      # progress info
      echo " - $EXPERIMENT"

      # data: the command line of the tester, joined into a single line
      tester_args=$(grep "Tester:" "$logfile" | sed 's/.*: //' | tr '\n' ' ' | sed 's/ *$//')
      CLASSIFIER=$(printf '%s\n' "$tester_args" | sed -e 's/.*-classifier //' -e 's/ .*//')
      OPTIONS=$(printf '%s\n' "$tester_args" | sed -e 's/.*-additional //' -e 's/-append_log.*//' -e 's/=/-/g')
      # the last summary line holds "<count> <percentage> %" after the word
      # "Instances" or "Exemplars"
      CORRECT_STR=$(grep "Correctly Classified" "$logfile" | tail -n 1 \
         | sed -e 's/.*Instances//' -e 's/.*Exemplars//' -e 's/^ *//' -e 's/ *%$//')
      INCORRECT_STR=$(grep "Incorrectly Classified" "$logfile" | tail -n 1 \
         | sed -e 's/.*Instances//' -e 's/.*Exemplars//' -e 's/^ *//' -e 's/ *%$//')
      # first word = count, remainder = percentage
      read -r CORRECT CORRECT_PERC <<< "$CORRECT_STR"
      read -r INCORRECT INCORRECT_PERC <<< "$INCORRECT_STR"
      # German number format: decimal comma instead of decimal point
      CORRECT_DE=${CORRECT//./,}
      CORRECT_PERC_DE=${CORRECT_PERC//./,}
      INCORRECT_DE=${INCORRECT//./,}
      INCORRECT_PERC_DE=${INCORRECT_PERC//./,}

      # csv-file
      VALUES="$EXPERIMENT;;$CLASSIFIER;$OPTIONS;;;;;"
      LINE="$VALUES;$CORRECT;$INCORRECT;$CORRECT_PERC;$INCORRECT_PERC;"
      LINE_DE="$VALUES;\"$CORRECT_DE\";\"$INCORRECT_DE\";\"$CORRECT_PERC_DE\";\"$INCORRECT_PERC_DE\";"
      printf '%s\n' "$LINE"    >> "$OUTPUT"
      printf '%s\n' "$LINE_DE" >> "$OUTPUT_DE"

      # LaTeX-file ("_" has to be escaped as "\_"). The row no longer
      # contains $INSTANCES: this function never computes it, so the value
      # was a leftover from whatever experiment had been processed before.
      VALUES=$(printf '%s\n' "$EXPERIMENT" | sed 's/_/\\_/g')
      LINE="\t\t\t$VALUES & $CORRECT_PERC \\\\\\\\"
      LINE_DE="\t\t\t$VALUES & $CORRECT_PERC_DE \\\\\\\\"
      echo -e "$LINE"    >> "$LATEX"
      echo -e "$LINE_DE" >> "$LATEX_DE"
   done

   # CSV: an empty line separates the sections
   FOOTER=""
   echo -e "$FOOTER" >> "$OUTPUT"
   echo -e "$FOOTER" >> "$OUTPUT_DE"

   # LaTeX: the table is only closed if it has been opened
   if [ "$INSERTED" = "yes" ]
   then
      FOOTER="\t\t\t\\hline\n\t\t\\\\end{tabular}\n\t\t\\\\newline\n\t\t\\\\caption{Results for $TITLE}\n\t\\\\end{center}\n\\\\end{table}\n"
      echo -e "$FOOTER" >> "$LATEX"
      echo -e "$FOOTER" >> "$LATEX_DE"
   fi
}

# Extracts the exceptions of the different experiments and appends them to
# the file EXCEPTION. Per data set the output consists of
#    <experiment>;
#    ;<stage>;"<exception lines of that stage, joined into one line>"
#    ... (one row per stage) ...
# followed by two empty lines.
# reads: COUNT_DATASETS, DATASETS, DIR, EXCEPTION
function print_exceptions()
{
   # the stages in output order and the file suffix that belongs to each
   local -a labels=(database import proper export evaluate classify test
                    mi-proper mi-evaluate mi-classify mi-test
                    remi-proper remi-evaluate remi-classify remi-test)
   local -a suffixes=(.db .import .proper .export .eval .class .test
                      -mi.proper -mi.eval -mi.class -mi.test
                      -remi.proper -remi.eval -remi.class -remi.test)
   local i k text

   echo "Exceptions..."

   HEADER="File;Exception"

   echo -e "$HEADER" >> "$EXCEPTION"

   # run over all experiments
   for ((i = 1; i <= COUNT_DATASETS; i++))
   do
      LINE=$i;TMPFILE="$DATASETS";get_line;EXPERIMENT=$TMP

      # progress info
      echo " - $EXPERIMENT"

      # csv-file
      {
         printf '%s;\n' "$EXPERIMENT"
         for ((k = 0; k < ${#labels[@]}; k++))
         do
            TMPFILE="$DIR/$EXPERIMENT${suffixes[k]}";get_exception
            # one line per stage: collapse all whitespace to single blanks
            # and double embedded quotes so that the CSV cell stays intact.
            # The text is written verbatim (no "echo -e"), so backslashes
            # and wildcard characters in a message are not interpreted.
            text=$(printf '%s' "$TMP" | tr -s '[:space:]' ' ' \
               | sed -e 's/^ //' -e 's/ $//' -e 's/"/""/g')
            # every row carries its stage label, including the database row
            printf ';%s;"%s"\n' "${labels[k]}" "$text"
         done
         printf '\n\n'
      } >> "$EXCEPTION"
   done
}

# Extracts the runtimes of the different experiments and appends one row per
# data set to the file RUNTIME. An evaluation whose statistics file is
# missing or empty is reported as "-".
# reads: COUNT_DATASETS, DATASETS, DIR, RUNTIME
function print_runtime()
{
   local i

   echo "Runtime..."

   HEADER="Dataset;Import;RELAGGS;RELAGGS-Evaluate;MILK;MILK-Evaluate;REMILK;REMILK-Evaluate"

   echo -e "$HEADER" >> "$RUNTIME"

   # run over all experiments
   for ((i = 1; i <= COUNT_DATASETS; i++))
   do
      LINE=$i;TMPFILE="$DATASETS";get_line;EXPERIMENT=$TMP

      # progress info
      echo " - $EXPERIMENT"

      # data
      TMPFILE="$DIR/$EXPERIMENT.import";get_runtime;IMPORT=$TMP
      TMPFILE="$DIR/$EXPERIMENT.proper";get_runtime;RUN=$TMP
      TMPFILE="$DIR/$EXPERIMENT.eval";get_runtime;EVAL=$TMP
      TMPFILE="$DIR/$EXPERIMENT-mi.proper";get_runtime;MIRUN=$TMP
      TMPFILE="$DIR/$EXPERIMENT-mi.eval";get_runtime;MIEVAL=$TMP
      TMPFILE="$DIR/$EXPERIMENT-remi.proper";get_runtime;REMIRUN=$TMP
      TMPFILE="$DIR/$EXPERIMENT-remi.eval";get_runtime;REMIEVAL=$TMP

      # if not finished -> set to "-"
      if [ ! -s "$DIR/$EXPERIMENT-stat.arff" ]
      then
         EVAL="-"
      fi
      if [ ! -s "$DIR/$EXPERIMENT-stat-mi.arff" ]
      then
         MIEVAL="-"
      fi
      if [ ! -s "$DIR/$EXPERIMENT-stat-remi.arff" ]
      then
         REMIEVAL="-"
      fi

      # csv-file; a value spanning several lines is joined with blanks so
      # that every data set stays on a single row
      LINE="$EXPERIMENT;$IMPORT;$RUN;$EVAL;$MIRUN;$MIEVAL;$REMIRUN;$REMIEVAL"

      printf '%s\n' "${LINE//$'\n'/ }" >> "$RUNTIME"
   done
}

# Extracts the tree sizes of the different experiments and appends one row
# per data set to the file TREE. An evaluation whose statistics file is
# missing or empty is reported as "-".
# reads: COUNT_DATASETS, DATASETS, DIR, TREE
function print_treesize()
{
   local i

   echo "Tree size..."

   HEADER="Dataset;RELAGGS;MILK;REMILK"

   echo -e "$HEADER" >> "$TREE"

   # run over all experiments
   for ((i = 1; i <= COUNT_DATASETS; i++))
   do
      LINE=$i;TMPFILE="$DATASETS";get_line;EXPERIMENT=$TMP

      # progress info
      echo " - $EXPERIMENT"

      # data
      TMPFILE="$DIR/$EXPERIMENT.eval";get_treesize;EVAL=$TMP
      TMPFILE="$DIR/$EXPERIMENT-mi.eval";get_treesize;MIEVAL=$TMP
      TMPFILE="$DIR/$EXPERIMENT-remi.eval";get_treesize;REMIEVAL=$TMP

      # if not finished -> set to "-"
      if [ ! -s "$DIR/$EXPERIMENT-stat.arff" ]
      then
         EVAL="-"
      fi
      if [ ! -s "$DIR/$EXPERIMENT-stat-mi.arff" ]
      then
         MIEVAL="-"
      fi
      if [ ! -s "$DIR/$EXPERIMENT-stat-remi.arff" ]
      then
         REMIEVAL="-"
      fi

      # csv-file; written verbatim so that the formulas are not altered
      LINE="$EXPERIMENT;$EVAL;$MIEVAL;$REMIEVAL"

      printf '%s\n' "${LINE//$'\n'/ }" >> "$TREE"
   done
}

# Extracts the parameters of the different experiments and appends them to
# the CSV files OUTPUT and OUTPUT_DE. Without PARAMETERS = "yes" only the
# section header is written. Per experiment the output consists of
#    <experiment>;database;"<parameters>"
#    ;<stage>;"<parameters of that stage, joined into one line>"
#    ... (one row per stage) ...
# followed by two empty lines.
# reads: PARAMETERS, DIR
function print_parameters()
{
   # the stages in output order, the file suffix that belongs to each and
   # the pattern that identifies the parameter line within that file
   local -a labels=(database import proper export evaluate classify test
                    mi-proper mi-evaluate mi-classify mi-test
                    remi-proper remi-evaluate remi-classify remi-test)
   local -a suffixes=(.db .import .proper .export .eval .class .test
                      -mi.proper -mi.eval -mi.class -mi.test
                      -remi.proper -remi.eval -remi.class -remi.test)
   local -a patterns=(Databases: Import: Proper: Export: Experimenter: Classifier: Tester:
                      Flattening: Experimenter: Classifier: Tester:
                      Flattening: Experimenter: Classifier: Tester:)
   local k rows text

   echo "Parameters..."

   if [ ! "$PARAMETERS" = "yes" ]
   then
      # the header is necessary, because the discovery of the names in the
      # Excel sheet will otherwise fail!
      HEADER="Parameters"
   else
      HEADER="Parameters\nexperiment;type;parameters"
   fi

   echo -e "$HEADER" >> "$OUTPUT"
   echo -e "$HEADER" >> "$OUTPUT_DE"

   if [ ! "$PARAMETERS" = "yes" ]
   then
      # "return", not "exit": the caller still has to write the exception,
      # runtime and tree size reports and to remove its temporary file
      return 0
   fi

   # run over all "stat"-files
   for FILE in "$DIR"/*-stat.arff
   do
      # zero length? (this also skips the pattern itself if nothing matched)
      if [ ! -s "$FILE" ]
      then
         continue
      fi

      # progress info: "<experiment>-stat.arff" -> "<experiment>"
      EXPERIMENT=$(printf '%s\n' "${FILE##*/}" | cut -d. -f1 | sed 's/-stat//g')
      echo " - $EXPERIMENT"

      # csv-file: collect the rows of this experiment
      rows="$EXPERIMENT"
      for ((k = 0; k < ${#labels[@]}; k++))
      do
         TMPFILE="$DIR/$EXPERIMENT${suffixes[k]}";TMP="${patterns[k]}";get_parameters
         # one line per stage: collapse all whitespace to single blanks.
         # The text is written verbatim (no "echo -e"), so backslashes and
         # wildcard characters in a parameter are not interpreted.
         text=$(printf '%s' "$TMP" | tr -s '[:space:]' ' ' | sed -e 's/^ //' -e 's/ $//')
         rows="$rows;${labels[k]};\"$text\""$'\n'
      done

      printf '%s\n\n' "$rows" >> "$OUTPUT"
      printf '%s\n\n' "$rows" >> "$OUTPUT_DE"
   done
}

##################
# some variables #
##################

# Directory of this script. dirname yields "." when the script is started
# without a path (e.g. "bash script.sh"); the former expr-based extraction
# returned an empty string in that case, which made DIR point at "/../tmp".
ROOT=$(dirname "$0")
DIR="$ROOT/../tmp"
OUTDIR="$DIR"
PARAMETERS="no"
# until the options are parsed PREFIX holds the base name of the output files
PREFIX="experiments"
# ALL stays "yes" as long as none of the "check only" options is given
ALL="yes"
CHECK_EXCEPTIONS="no"
CHECK_RUNTIME="no"
CHECK_TREE="no"

# interpret parameters (the leading ":" selects silent error reporting, so
# unknown options and missing arguments end up in the "*" branch)
while getopts ":hperto:i:n:" flag
do
   case "$flag" in
      i) DIR="$OPTARG"
         ;;
      o) OUTDIR="$OPTARG"
         ;;
      n) PREFIX="$OPTARG"
         ;;
      p) PARAMETERS="yes"
         ;;
      e) CHECK_EXCEPTIONS="yes"
         ALL="no"
         ;;
      r) CHECK_RUNTIME="yes"
         ALL="no"
         ;;
      t) CHECK_TREE="yes"
         ALL="no"
         ;;
      h) usage
         exit 0
         ;;
      *) usage
         exit 1
         ;;
   esac
done

# output files
OUTPUT="$OUTDIR/$PREFIX.csv"
OUTPUT_DE="$OUTDIR/$PREFIX-de.csv"
LATEX="$OUTDIR/$PREFIX.tex"
LATEX_DE="$OUTDIR/$PREFIX-de.tex"
EXCEPTION="$OUTDIR/$PREFIX-exceptions.csv"
RUNTIME="$OUTDIR/$PREFIX-runtime.csv"
TREE="$OUTDIR/$PREFIX-treesize.csv"
# temporary files
ATTR_FILE="$OUTDIR/_attr"
DATA_FILE="$OUTDIR/_data"
NOBLANK_FILE="$OUTDIR/_noblank"
# from here on PREFIX is the prefix of the pieces written by csplit
PREFIX="$OUTDIR/xx"
DATASETS="$OUTDIR/_datasets"

################
# SCRIPT START #
################

# determine datasets: every "<name>-stat*.arff" in DIR contributes <name>
# once (a glob is used instead of parsing the output of ls, so that unusual
# file names cannot be split or altered)
for f in "$DIR"/*-stat*.arff
do
   # the pattern stays as it is if nothing matched - skip it
   [ -e "$f" ] || continue
   f=${f##*/}
   printf '%s\n' "${f%%-stat*}"
done | sort -u > "$DATASETS"
TMPFILE="$DATASETS";count_lines;COUNT_DATASETS=$TMP

if [ "$ALL" = "yes" ]
then
   rm -f -- "$OUTPUT" "$OUTPUT_DE" "$LATEX" "$LATEX_DE"

   # TESTER is read by get_reccount, SUFFIX selects the experiment family
   TESTER="no";SUFFIX="";TITLE="RELAGGS-Experimenter";print_experimenter
   TESTER="yes";SUFFIX="";TITLE="RELAGGS-Tester";print_tester

   TESTER="no";SUFFIX="-mi";TITLE="MILK-Experimenter";print_experimenter
   TESTER="yes";SUFFIX="-mi";TITLE="MILK-Tester";print_tester

   TESTER="no";SUFFIX="-remi";TITLE="REMILK-Experimenter";print_experimenter
   TESTER="yes";SUFFIX="-remi";TITLE="REMILK-Tester";print_tester

   print_parameters
fi

if [ "$ALL" = "yes" ] || [ "$CHECK_EXCEPTIONS" = "yes" ]
then
   rm -f -- "$EXCEPTION"
   print_exceptions
fi

if [ "$ALL" = "yes" ] || [ "$CHECK_RUNTIME" = "yes" ]
then
   rm -f -- "$RUNTIME"
   print_runtime
fi

if [ "$ALL" = "yes" ] || [ "$CHECK_TREE" = "yes" ]
then
   rm -f -- "$TREE"
   print_treesize
fi

# clean up additionally
rm -f -- "$DATASETS"

