/*
 * Decompiled with CFR 0.152.
 */
package jsat.text;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import jsat.DataSet;
import jsat.SimpleDataSet;
import jsat.classifiers.CategoricalData;
import jsat.classifiers.DataPoint;
import jsat.linear.SparseVector;
import jsat.linear.Vec;
import jsat.text.HashedTextVectorCreator;
import jsat.text.TextVectorCreator;
import jsat.text.tokenizer.Tokenizer;
import jsat.text.wordweighting.WordWeighting;
import jsat.utils.IntList;

/**
 * Base class for loading a collection of text documents into sparse vectors
 * using the hashing trick: each token is mapped to the bucket
 * {@code |hashCode % dimensionSize|} instead of being assigned a dictionary
 * index.
 *
 * <p>Subclasses implement {@link #initialLoad()} and call
 * {@link #addOriginalDocument(String)} once per document. After loading is
 * finished the configured {@link WordWeighting} is applied to every stored
 * vector, and new text can be vectorized through the {@link TextVectorCreator}
 * methods.
 */
public abstract class HashedTextDataLoader implements TextVectorCreator {
    private static final long serialVersionUID = 8513621180409278670L;

    /** Number of hash buckets used when none is given (2^22). */
    private static final int DEFAULT_DIMENSION_SIZE = 0x400000;

    /**
     * The word-count map is re-allocated every this many documents,
     * presumably so one unusually large document does not keep an oversized
     * table alive for the rest of the load.
     */
    private static final int WORD_COUNT_RESET_INTERVAL = 50;

    private final int dimensionSize;
    private final Tokenizer tokenizer;
    private final WordWeighting weighting;
    /** One vector per document added, in insertion order. */
    protected List<SparseVector> vectors;
    /**
     * Per-bucket totals accumulated while loading; set to {@code null} once
     * {@link #finishAdding()} has consumed them.
     */
    private int[] termDocumentFrequencys;
    /** {@code true} once the initial data set has been finalized. */
    protected boolean noMoreAdding;
    /** Number of documents added so far. */
    private int documents;
    /** Scratch buffer handed to the tokenizer; released by {@link #finishAdding()}. */
    protected StringBuilder workSpace;
    /** Scratch token list handed to the tokenizer; released by {@link #finishAdding()}. */
    protected List<String> storageSpace;
    /** Scratch per-document word counts; released by {@link #finishAdding()}. */
    protected Map<String, Integer> wordCounts;
    private final TextVectorCreator tvc;

    /**
     * Creates a loader with the default number of hash buckets (2^22).
     *
     * @param tokenizer splits raw text into tokens
     * @param weighting weighting scheme applied to the vectors once loading finishes
     */
    public HashedTextDataLoader(Tokenizer tokenizer, WordWeighting weighting) {
        this(DEFAULT_DIMENSION_SIZE, tokenizer, weighting);
    }

    /**
     * Creates a loader with an explicit number of hash buckets.
     *
     * @param dimensionSize length of every produced vector; must be positive
     * @param tokenizer splits raw text into tokens
     * @param weighting weighting scheme applied to the vectors once loading finishes
     * @throws IllegalArgumentException if {@code dimensionSize} is not positive
     */
    public HashedTextDataLoader(int dimensionSize, Tokenizer tokenizer, WordWeighting weighting) {
        if (dimensionSize <= 0) {
            // Fail here rather than later with a "% 0" or negative array size error.
            throw new IllegalArgumentException("dimensionSize must be positive, not " + dimensionSize);
        }
        this.dimensionSize = dimensionSize;
        this.tokenizer = tokenizer;
        this.weighting = weighting;
        this.termDocumentFrequencys = new int[dimensionSize];
        this.vectors = new ArrayList<SparseVector>();
        this.tvc = new HashedTextVectorCreator(dimensionSize, tokenizer, weighting);
        this.noMoreAdding = false;
    }

    /**
     * Loads the initial documents. Implementations are expected to call
     * {@link #addOriginalDocument(String)} for each document.
     */
    protected abstract void initialLoad();

    /**
     * Tokenizes one document, hashes its tokens into a sparse count vector and
     * appends that vector to {@link #vectors}.
     *
     * @param text the raw document text
     * @throws IllegalStateException if the data set has already been finalized
     */
    protected void addOriginalDocument(String text) {
        if (this.noMoreAdding) {
            throw new IllegalStateException("Initial data set has been finalized");
        }
        if (this.workSpace == null) {
            this.workSpace = new StringBuilder();
        }
        if (this.storageSpace == null) {
            this.storageSpace = new ArrayList<String>();
        }
        this.workSpace.setLength(0);
        this.storageSpace.clear();
        this.tokenizer.tokenize(text, this.workSpace, this.storageSpace);

        if (this.wordCounts == null || this.documents % WORD_COUNT_RESET_INTERVAL == 0) {
            this.wordCounts = new HashMap<String, Integer>(this.storageSpace.size());
        }
        for (String word : this.storageSpace) {
            Integer seen = this.wordCounts.get(word);
            this.wordCounts.put(word, seen == null ? 1 : seen + 1);
        }

        SparseVector vec = new SparseVector(this.dimensionSize, this.wordCounts.size());
        for (Map.Entry<String, Integer> entry : this.wordCounts.entrySet()) {
            int count = entry.getValue();
            int index = this.hashIndex(entry.getKey());
            // Accumulate rather than overwrite: distinct words may share a bucket,
            // and a plain set() would drop the count of all but the last one.
            vec.set(index, vec.get(index) + count);
            // NOTE(review): this adds the in-document count, not 1 per document,
            // despite the field name -- confirm this is what the weighting expects.
            this.termDocumentFrequencys[index] += count;
        }
        // Leave the map empty for the next document.
        this.wordCounts.clear();

        this.vectors.add(vec);
        ++this.documents;
    }

    /**
     * Maps a word to its bucket in {@code [0, dimensionSize)}.
     *
     * <p>The remainder is taken before the absolute value because
     * {@code Math.abs(Integer.MIN_VALUE)} is still negative. For every other
     * hash code the result equals {@code Math.abs(hashCode) % dimensionSize}.
     */
    private int hashIndex(String word) {
        return Math.abs(word.hashCode() % this.dimensionSize);
    }

    /**
     * Finalizes the initial data set: blocks further additions, releases the
     * scratch buffers, fits the weighting on the collected vectors and applies
     * it to each of them. Calls after the first one do nothing.
     */
    protected void finishAdding() {
        this.noMoreAdding = true;
        if (this.termDocumentFrequencys == null) {
            // Already finalized; the weighting must not be applied a second time.
            return;
        }
        this.workSpace = null;
        this.storageSpace = null;
        this.wordCounts = null;
        this.weighting.setWeight(this.vectors, IntList.unmodifiableView(this.termDocumentFrequencys, this.dimensionSize));
        for (SparseVector vec : this.vectors) {
            this.weighting.applyTo(vec);
        }
        this.termDocumentFrequencys = null;
    }

    /**
     * Returns the loaded documents as a data set, running
     * {@link #initialLoad()} and {@link #finishAdding()} first if the data set
     * has not been finalized yet.
     *
     * @return a new data set with one data point per document vector and no
     *         categorical attributes
     */
    public DataSet getDataSet() {
        if (!this.noMoreAdding) {
            this.initialLoad();
            this.finishAdding();
        }
        List<DataPoint> dataPoints = new ArrayList<DataPoint>(this.vectors.size());
        for (SparseVector vec : this.vectors) {
            dataPoints.add(new DataPoint(vec, new int[0], new CategoricalData[0]));
        }
        return new SimpleDataSet(dataPoints);
    }

    /**
     * Vectorizes new text by delegating to {@link #getTextVectorCreator()}.
     *
     * @param input the text to convert
     * @return the vector produced by the underlying creator
     * @throws IllegalStateException if the initial documents have not been loaded
     */
    @Override
    public Vec newText(String input) {
        return this.getTextVectorCreator().newText(input);
    }

    /**
     * Vectorizes new text with caller-supplied scratch space by delegating to
     * {@link #getTextVectorCreator()}.
     *
     * @param input the text to convert
     * @param workSpace scratch buffer passed through to the underlying creator
     * @param storageSpace scratch token list passed through to the underlying creator
     * @return the vector produced by the underlying creator
     * @throws IllegalStateException if the initial documents have not been loaded
     */
    @Override
    public Vec newText(String input, StringBuilder workSpace, List<String> storageSpace) {
        return this.getTextVectorCreator().newText(input, workSpace, storageSpace);
    }

    /**
     * Returns the vector creator built with this loader's dimension size,
     * tokenizer and weighting.
     *
     * @return the underlying text vector creator
     * @throws IllegalStateException if the initial documents have not been loaded
     */
    public TextVectorCreator getTextVectorCreator() {
        if (!this.noMoreAdding) {
            throw new IllegalStateException("Initial documents have not yet loaded");
        }
        return this.tvc;
    }
}

