/*
 * Decompiled with CFR 0.152.
 */
package jsat.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import jsat.DataSet;
import jsat.SimpleDataSet;
import jsat.classifiers.CategoricalData;
import jsat.classifiers.DataPoint;
import jsat.datatransform.RemoveAttributeTransform;
import jsat.linear.SparseVector;
import jsat.linear.Vec;
import jsat.text.BasicTextVectorCreator;
import jsat.text.TextVectorCreator;
import jsat.text.tokenizer.Tokenizer;
import jsat.text.wordweighting.WordWeighting;
import jsat.utils.IntList;
import jsat.utils.IntSet;

/**
 * Base class for loading a corpus of text documents and converting each one
 * into a sparse vector of per-word values.
 *
 * <p>Usage has two phases. While loading, subclasses feed raw documents to
 * {@link #addOriginalDocument(String)} (normally from {@link #initialLoad()}),
 * which grows the vocabulary and stores one raw word-count vector per
 * document. Once {@link #finishAdding()} has run, the vocabulary is frozen,
 * the configured {@link WordWeighting} is applied to every stored vector, and
 * new text can be vectorized through {@link #newText(String)}.
 *
 * <p>This class performs no synchronization of its own.
 */
public abstract class TextDataLoader implements TextVectorCreator {
    private static final long serialVersionUID = -657254682338792871L;
    /** One vector per document added so far, in insertion order. */
    protected List<SparseVector> vectors = new ArrayList<SparseVector>();
    /** Splits raw text into tokens. */
    protected Tokenizer tokenizer;
    /** Maps each known word to its vector index. */
    protected Map<String, Integer> wordIndex;
    /** Inverse of {@link #wordIndex}: the word stored at each vector index. */
    protected List<String> allWords;
    /** For each word index, the number of added documents that contained the word. */
    protected List<Integer> termDocumentFrequencys;
    private WordWeighting weighting;
    /** Scratch buffer handed to the tokenizer; {@code null} outside the loading phase. */
    protected StringBuilder workSpace;
    /** Scratch token list filled by the tokenizer; {@code null} outside the loading phase. */
    protected List<String> storageSpace;
    /** Scratch per-document word counts; emptied after every document. */
    protected Map<String, Integer> wordCounts;
    /** Lazily created by {@link #getTextVectorCreator()}. */
    private TextVectorCreator tvc;
    /** {@code true} once the initial corpus has been finalized. */
    protected boolean noMoreAdding;
    /** Current vocabulary size, i.e. the next free word index. */
    private int currentLength = 0;
    /** Number of documents added so far. */
    private int documents;

    /**
     * Creates a loader with an empty vocabulary.
     *
     * @param tokenizer the tokenizer used to split documents into words
     * @param weighting the weighting applied to the vectors once loading is finished
     */
    public TextDataLoader(Tokenizer tokenizer, WordWeighting weighting) {
        this.tokenizer = tokenizer;
        this.wordIndex = new HashMap<String, Integer>();
        this.termDocumentFrequencys = new IntList();
        this.weighting = weighting;
        this.allWords = new ArrayList<String>();
        this.noMoreAdding = false;
    }

    /**
     * Loads the initial corpus. {@link #getDataSet()} invokes this once, before
     * finalizing, if the loader has not been finalized yet.
     */
    public abstract void initialLoad();

    /**
     * Tokenizes one document, extends the vocabulary with any unseen words,
     * updates the per-word document frequencies and stores the document's raw
     * word-count vector.
     *
     * @param text the raw text of the document
     * @throws IllegalStateException if the initial data set has already been finalized
     */
    protected void addOriginalDocument(String text) {
        if (this.noMoreAdding) {
            throw new IllegalStateException("Initial data set has been finalized");
        }
        if (this.workSpace == null) {
            this.workSpace = new StringBuilder();
            this.storageSpace = new ArrayList<String>();
            this.wordCounts = new HashMap<String, Integer>();
        }
        this.workSpace.setLength(0);
        this.storageSpace.clear();
        this.tokenizer.tokenize(text, this.workSpace, this.storageSpace);

        // Every 50 documents swap in a fresh count map sized for the current
        // document. The old map is empty at this point because it is cleared
        // at the end of every call.
        if (this.documents % 50 == 0) {
            this.wordCounts = new HashMap<String, Integer>(this.storageSpace.size() * 3 / 2);
        }

        // Count the occurrences of each distinct word in this document.
        for (String word : this.storageSpace) {
            Integer count = this.wordCounts.get(word);
            this.wordCounts.put(word, count == null ? 1 : count + 1);
        }

        // Length is currentLength + 1 so the vector is never created with length 0;
        // finishAdding() later resizes every vector to the final vocabulary size.
        SparseVector vec = new SparseVector(this.currentLength + 1, this.wordCounts.size());
        for (Map.Entry<String, Integer> entry : this.wordCounts.entrySet()) {
            String word = entry.getKey();
            int count = entry.getValue().intValue();
            Integer indx = this.wordIndex.get(word);
            if (indx == null) {
                // First time this word is seen anywhere: give it the next index.
                int newIndex = this.currentLength++;
                this.allWords.add(word);
                this.wordIndex.put(word, newIndex);
                this.termDocumentFrequencys.add(1);
                vec.setLength(this.currentLength);
                vec.set(newIndex, count);
            } else {
                this.termDocumentFrequencys.set(indx, this.termDocumentFrequencys.get(indx) + 1);
                vec.set(indx, count);
            }
        }
        // Leave the scratch map empty for the next document.
        this.wordCounts.clear();

        this.vectors.add(vec);
        ++this.documents;
    }

    /**
     * Finalizes the initial corpus: releases the scratch buffers, resizes every
     * stored vector to the final vocabulary size and applies the word weighting.
     *
     * <p>Calling this more than once has no further effect. Without that guard,
     * a second call (for example from {@link #getDataSet()} after a subclass
     * already called it inside {@link #initialLoad()}) would apply the weighting
     * to the already-weighted vectors a second time.
     */
    protected void finishAdding() {
        if (this.noMoreAdding) {
            return;
        }
        this.noMoreAdding = true;
        this.workSpace = null;
        this.storageSpace = null;
        this.wordCounts = null;
        this.weighting.setWeight(this.vectors, this.termDocumentFrequencys);
        for (SparseVector vec : this.vectors) {
            vec.setLength(this.currentLength);
            this.weighting.applyTo(vec);
        }
    }

    /**
     * Returns the loaded documents as a data set with one data point per
     * document. If the loader has not been finalized yet, this first runs
     * {@link #initialLoad()} followed by {@link #finishAdding()}.
     *
     * @return a new data set wrapping the stored document vectors, with no
     *         categorical attributes
     */
    public DataSet getDataSet() {
        if (!this.noMoreAdding) {
            this.initialLoad();
            this.finishAdding();
        }
        List<DataPoint> dataPoints = new ArrayList<DataPoint>(this.vectors.size());
        for (SparseVector vec : this.vectors) {
            dataPoints.add(new DataPoint(vec, new int[0], new CategoricalData[0]));
        }
        return new SimpleDataSet(dataPoints);
    }

    /**
     * Vectorizes new text using the finalized vocabulary.
     *
     * @param text the text to convert
     * @return the vector produced by {@link #getTextVectorCreator()}
     * @throws IllegalStateException if the initial documents have not been loaded yet
     */
    @Override
    public Vec newText(String text) {
        return this.getTextVectorCreator().newText(text);
    }

    /**
     * Vectorizes new text using the finalized vocabulary and caller-supplied
     * scratch space.
     *
     * @param input the text to convert
     * @param workSpace scratch buffer passed through to the underlying creator
     * @param storageSpace scratch token list passed through to the underlying creator
     * @return the vector produced by {@link #getTextVectorCreator()}
     * @throws IllegalStateException if the initial documents have not been loaded yet
     */
    @Override
    public Vec newText(String input, StringBuilder workSpace, List<String> storageSpace) {
        return this.getTextVectorCreator().newText(input, workSpace, storageSpace);
    }

    /**
     * Returns the vector creator backed by this loader's tokenizer, vocabulary
     * and weighting. The creator is built on first use and then reused.
     *
     * @return the shared text vector creator
     * @throws IllegalStateException if the initial documents have not been loaded yet
     */
    public TextVectorCreator getTextVectorCreator() {
        if (!this.noMoreAdding) {
            throw new IllegalStateException("Initial documents have not yet loaded");
        }
        if (this.tvc == null) {
            this.tvc = new BasicTextVectorCreator(this.tokenizer, this.wordIndex, this.weighting);
        }
        return this.tvc;
    }

    /**
     * Looks up the word assigned to a vector index.
     *
     * @param index the vector index
     * @return the word at that index, or {@code null} if the index is outside
     *         the current vocabulary
     */
    public String getWordForIndex(int index) {
        if (index < 0 || index >= this.allWords.size()) {
            return null;
        }
        return this.allWords.get(index);
    }

    /**
     * Returns the number of added documents that contained the word at the
     * given index. Despite the method name this is a document frequency: it is
     * incremented at most once per document, not once per occurrence.
     *
     * @param index the vector index of the word
     * @return the document frequency recorded for that index
     */
    public int getTermFrequency(int index) {
        return this.termDocumentFrequencys.get(index);
    }

    /**
     * Builds a transform factory that removes every word index whose document
     * frequency is below {@code minCount}.
     *
     * @param minCount the minimum number of documents a word must appear in to be kept
     * @return a factory configured with an empty first removal set and the
     *         collected word indices as the second removal set
     */
    public RemoveAttributeTransform.RemoveAttributeTransformFactory getMinimumOccurrenceDTF(int minCount) {
        IntSet numericToRemove = new IntSet();
        for (int i = 0; i < this.termDocumentFrequencys.size(); ++i) {
            if (this.termDocumentFrequencys.get(i) < minCount) {
                numericToRemove.add(i);
            }
        }
        return new RemoveAttributeTransform.RemoveAttributeTransformFactory(Collections.<Integer>emptySet(), numericToRemove);
    }
}

