"""TODO(squad_v2): Add a description here."""

from __future__ import absolute_import, division, print_function

import json
import os

import nlp


# BibTeX entry passed as `citation` to `nlp.DatasetInfo` in `SquadV2._info`.
# NOTE(review): judging by its title and arXiv id (1606.05250) this entry is
# the original SQuAD paper; confirm whether the SQuAD 2.0 paper should be
# cited here as well.
_CITATION = """\
@article{2016arXiv160605250R,
       author = {{Rajpurkar}, Pranav and {Zhang}, Jian and {Lopyrev},
                 Konstantin and {Liang}, Percy},
        title = "{SQuAD: 100,000+ Questions for Machine Comprehension of Text}",
      journal = {arXiv e-prints},
         year = 2016,
          eid = {arXiv:1606.05250},
        pages = {arXiv:1606.05250},
archivePrefix = {arXiv},
       eprint = {1606.05250},
}
"""

# Human-readable summary passed as `description` to `nlp.DatasetInfo` in
# `SquadV2._info`.
_DESCRIPTION = """\
combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers
 to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but 
 also determine when no answer is supported by the paragraph and abstain from answering.
"""

# Base URL of the dataset files; the file names below are appended to it in
# `SquadV2._split_generators`.
_URL = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
# File name used for the validation split.
_DEV_FILE = "dev-v2.0.json"
# File name used for the training split.
_TRAINING_FILE = "train-v2.0.json"

class SquadV2Config(nlp.BuilderConfig):
  """Builder configuration for the SQuAD v2 dataset."""

  def __init__(self, **kwargs):
    """Creates the configuration; it defines no options of its own.

    Args:
      **kwargs: keyword arguments handed unchanged to the parent
        `nlp.BuilderConfig` initializer.
    """
    super().__init__(**kwargs)
      
      
class SquadV2(nlp.GeneratorBasedBuilder):
  """Dataset builder for SQuAD v2.0 (train and validation splits).

  Every example is one question and carries the fields `id`, `title`,
  `context`, `question` and `answers`, where `answers` holds two parallel
  lists: `text` and `answer_start`.
  """

  BUILDER_CONFIGS = [
      SquadV2Config(
          name="squad_v2",
          version=nlp.Version("2.0.0"),
          description="SQuAD plain text version 2",
      ),
  ]

  def _info(self):
    """Returns the `nlp.DatasetInfo` describing the dataset.

    Returns:
      An `nlp.DatasetInfo` with the description, the feature schema, the
      homepage and the citation. `supervised_keys` is None, i.e. no
      (input, target) pair is declared.
    """
    return nlp.DatasetInfo(
        description=_DESCRIPTION,
        features=nlp.Features({
            "id": nlp.Value("string"),
            "title": nlp.Value("string"),
            "context": nlp.Value("string"),
            "question": nlp.Value("string"),
            # One entry per reference answer; presumably empty for the
            # unanswerable questions mentioned in the description.
            "answers": nlp.features.Sequence({
                "text": nlp.Value("string"),
                "answer_start": nlp.Value("int32"),
            }),
        }),
        supervised_keys=None,
        homepage="https://rajpurkar.github.io/SQuAD-explorer/",
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Downloads the train/dev JSON files and declares the two splits.

    Args:
      dl_manager: download manager; its `download_and_extract` method is
        called with a dict of URLs and the result is indexed with the same
        keys ("train", "dev").

    Returns:
      A list with two `nlp.SplitGenerator` objects (TRAIN and VALIDATION),
      each passing the local path of its file as `filepath` to
      `_generate_examples`.
    """
    # URLs always use "/" as separator, so they are joined explicitly instead
    # of with os.path.join, which applies the separator of the local OS.
    base_url = _URL.rstrip("/")
    urls_to_download = {
        "train": base_url + "/" + _TRAINING_FILE,
        "dev": base_url + "/" + _DEV_FILE,
    }
    downloaded_files = dl_manager.download_and_extract(urls_to_download)

    return [
        nlp.SplitGenerator(
            name=nlp.Split.TRAIN,
            gen_kwargs={"filepath": downloaded_files["train"]}),
        nlp.SplitGenerator(
            name=nlp.Split.VALIDATION,
            gen_kwargs={"filepath": downloaded_files["dev"]}),
    ]

  def _generate_examples(self, filepath):
    """Yields one `(key, example)` pair per question in a SQuAD JSON file.

    Args:
      filepath: path of a JSON file whose top-level "data" list contains
        articles with "paragraphs", each holding a "context" and a list of
        "qas" (with "id", "question" and "answers").

    Yields:
      Tuples `(id, example)` where `id` is the question id and `example` is
      a dict with the keys "title", "context", "question", "id" and
      "answers" (parallel "answer_start" / "text" lists).

    Raises:
      KeyError: if a required key is missing from the JSON structure.
    """
    # Decode explicitly as UTF-8: the platform default encoding may differ
    # and would fail on (or garble) non-ASCII text.
    with open(filepath, encoding="utf-8") as f:
      squad = json.load(f)
    # The file is fully parsed above, so it is closed before the first
    # example is yielded rather than staying open while the caller iterates.

    for article in squad["data"]:
      title = article.get("title", "").strip()
      for paragraph in article["paragraphs"]:
        # The context is kept verbatim: `answer_start` is a character offset
        # into it, so stripping leading whitespace would shift every offset.
        context = paragraph["context"]
        for qa in paragraph["qas"]:
          id_ = qa["id"]
          answers = qa["answers"]
          yield id_, {
              "title": title,
              "context": context,
              "question": qa["question"].strip(),
              "id": id_,
              "answers": {
                  "answer_start": [a["answer_start"] for a in answers],
                  # Kept verbatim as well so that the text still matches the
                  # context at `answer_start`.
                  "text": [a["text"] for a in answers],
              },
          }
