.gitattributes
.gitignore
.travis.yml
CHANGELOG.md
DOCKER.md
Dockerfile
LICENSE
MANIFEST.in
README.md
bashrc
download_corpora.py
requirements.txt
setup.py
.github/ISSUE_TEMPLATE.md
.github/PULL_REQUEST_TEMPLATE.md
scraper/__init__.py
scraper/article.py
scraper/chromium.py
scraper/cli.py
scraper/configuration.py
scraper/content_extractor.py
scraper/document_cleaner.py
scraper/image_extractor.py
scraper/mthreading.py
scraper/network.py
scraper/nlp.py
scraper/output_formatter.py
scraper/parser.py
scraper/settings.py
scraper/source.py
scraper/sources.py
scraper/text.py
scraper/urls.py
scraper/utils.py
scraper/version.py
scraper/video_extractor.py
scraper/resources/bootstrap_iso_stopwords
scraper/resources/misc/stopwords-nlp-en.txt
scraper/resources/misc/useragents.txt
scraper/resources/text/stopwords-af.txt
scraper/resources/text/stopwords-ar.txt
scraper/resources/text/stopwords-be.txt
scraper/resources/text/stopwords-bg.txt
scraper/resources/text/stopwords-bn.txt
scraper/resources/text/stopwords-br.txt
scraper/resources/text/stopwords-ca.txt
scraper/resources/text/stopwords-cs.txt
scraper/resources/text/stopwords-da.txt
scraper/resources/text/stopwords-de.txt
scraper/resources/text/stopwords-el.txt
scraper/resources/text/stopwords-en.txt
scraper/resources/text/stopwords-eo.txt
scraper/resources/text/stopwords-es.txt
scraper/resources/text/stopwords-et.txt
scraper/resources/text/stopwords-eu.txt
scraper/resources/text/stopwords-fa.txt
scraper/resources/text/stopwords-fi.txt
scraper/resources/text/stopwords-fr.txt
scraper/resources/text/stopwords-ga.txt
scraper/resources/text/stopwords-gl.txt
scraper/resources/text/stopwords-gu.txt
scraper/resources/text/stopwords-ha.txt
scraper/resources/text/stopwords-he.txt
scraper/resources/text/stopwords-hi.txt
scraper/resources/text/stopwords-hr.txt
scraper/resources/text/stopwords-hu.txt
scraper/resources/text/stopwords-hy.txt
scraper/resources/text/stopwords-id.txt
scraper/resources/text/stopwords-it.txt
scraper/resources/text/stopwords-ja.txt
scraper/resources/text/stopwords-ka.txt
scraper/resources/text/stopwords-ko.txt
scraper/resources/text/stopwords-ku.txt
scraper/resources/text/stopwords-la.txt
scraper/resources/text/stopwords-lt.txt
scraper/resources/text/stopwords-lv.txt
scraper/resources/text/stopwords-mk.txt
scraper/resources/text/stopwords-mr.txt
scraper/resources/text/stopwords-ms.txt
scraper/resources/text/stopwords-nb.txt
scraper/resources/text/stopwords-nl.txt
scraper/resources/text/stopwords-no.txt
scraper/resources/text/stopwords-np.txt
scraper/resources/text/stopwords-pl.txt
scraper/resources/text/stopwords-pt.txt
scraper/resources/text/stopwords-ro.txt
scraper/resources/text/stopwords-ru.txt
scraper/resources/text/stopwords-sk.txt
scraper/resources/text/stopwords-sl.txt
scraper/resources/text/stopwords-so.txt
scraper/resources/text/stopwords-sr.txt
scraper/resources/text/stopwords-st.txt
scraper/resources/text/stopwords-sv.txt
scraper/resources/text/stopwords-sw.txt
scraper/resources/text/stopwords-ta.txt
scraper/resources/text/stopwords-th.txt
scraper/resources/text/stopwords-tl.txt
scraper/resources/text/stopwords-tr.txt
scraper/resources/text/stopwords-uk.txt
scraper/resources/text/stopwords-ur.txt
scraper/resources/text/stopwords-vi.txt
scraper/resources/text/stopwords-yo.txt
scraper/resources/text/stopwords-zh.txt
scraper/resources/text/stopwords-zu.txt
stimson_web_scraper.egg-info/PKG-INFO
stimson_web_scraper.egg-info/SOURCES.txt
stimson_web_scraper.egg-info/dependency_links.txt
stimson_web_scraper.egg-info/entry_points.txt
stimson_web_scraper.egg-info/not-zip-safe
stimson_web_scraper.egg-info/requires.txt
stimson_web_scraper.egg-info/top_level.txt
tests/__init__.py
tests/conftest.py
tests/fixtures/20170914-tpch-egm201701-enc02-th.pdf
tests/fixtures/20170914-tpch-egm201701-enc02-th.url
tests/fixtures/Oxford_University_Press.html
tests/fixtures/Oxford_University_Press.url
tests/fixtures/books.google.com.html
tests/fixtures/books.google.com.url
tests/fixtures/fulltext_domain_list.txt
tests/fixtures/fulltext_url_list.txt
tests/fixtures/google_scholar.html
tests/fixtures/news_ycombinator.html
tests/fixtures/test_prepare_urls.txt
tests/fixtures/test_urls.txt
tests/fixtures/test_urls_pubdate.txt
tests/fixtures/www.econstor.eu.pdf
tests/fixtures/www.econstor.eu.url
tests/fixtures/energy_investment_mekong_delta/Thailand.url
tests/fixtures/html/247wallst.com1.html
tests/fixtures/html/247wallst.com2.html
tests/fixtures/html/about.com1.html
tests/fixtures/html/about.com2.html
tests/fixtures/html/adoption.com1.html
tests/fixtures/html/al.com1.html
tests/fixtures/html/al.com2.html
tests/fixtures/html/ap_meta_refresh.html
tests/fixtures/html/apartmenttherapy.com1.html
tests/fixtures/html/apartmenttherapy.com2.html
tests/fixtures/html/arabic_article.html
tests/fixtures/html/architecturaldigest.com1.html
tests/fixtures/html/architecturaldigest.com2.html
tests/fixtures/html/avclub.com1.html
tests/fixtures/html/avclub.com2.html
tests/fixtures/html/backstage.com1.html
tests/fixtures/html/backstage.com2.html
tests/fixtures/html/bhg.com1.html
tests/fixtures/html/bhg.com2.html
tests/fixtures/html/bloomberg.com1.html
tests/fixtures/html/bostonherald.com1.html
tests/fixtures/html/bostonherald.com2.html
tests/fixtures/html/businessinsider.com1.html
tests/fixtures/html/businessinsider.com2.html
tests/fixtures/html/businessweek.com1.html
tests/fixtures/html/businessweek.com2.html
tests/fixtures/html/chinese_article.html
tests/fixtures/html/cleveland.com1.html
tests/fixtures/html/cleveland.com2.html
tests/fixtures/html/cnn_article.html
tests/fixtures/html/cnn_main_site.html
tests/fixtures/html/cntraveler.com1.html
tests/fixtures/html/cntraveler.com2.html
tests/fixtures/html/coolhunting.com1.html
tests/fixtures/html/coolhunting.com2.html
tests/fixtures/html/cricket.com.au1.html
tests/fixtures/html/cricket.com.au2.html
tests/fixtures/html/dailycaller.com1.html
tests/fixtures/html/dailycaller.com2.html
tests/fixtures/html/dailystar.co.uk1.html
tests/fixtures/html/dailystar.co.uk2.html
tests/fixtures/html/dallasnews.com1.html
tests/fixtures/html/dallasnews.com2.html
tests/fixtures/html/details.com1.html
tests/fixtures/html/details.com2.html
tests/fixtures/html/elle.com1.html
tests/fixtures/html/elle.com2.html
tests/fixtures/html/flavorwire.com1.html
tests/fixtures/html/flavorwire.com2.html
tests/fixtures/html/fool.com1.html
tests/fixtures/html/fool.com2.html
tests/fixtures/html/foxbusiness.com1.html
tests/fixtures/html/foxbusiness.com2.html
tests/fixtures/html/foxnews.com1.html
tests/fixtures/html/foxnews.com2.html
tests/fixtures/html/glamour.com1.html
tests/fixtures/html/glamour.com2.html
tests/fixtures/html/globalnews.ca1.html
tests/fixtures/html/globalnews.ca2.html
tests/fixtures/html/google_meta_refresh.html
tests/fixtures/html/gq.com1.html
tests/fixtures/html/gq.com2.html
tests/fixtures/html/graziadaily.co.uk1.html
tests/fixtures/html/graziadaily.co.uk2.html
tests/fixtures/html/gulflive.com1.html
tests/fixtures/html/gulflive.com2.html
tests/fixtures/html/huffingtonpost.com1.html
tests/fixtures/html/japanese_article.html
tests/fixtures/html/japanese_article2.html
tests/fixtures/html/lifebuzz.com1.html
tests/fixtures/html/lifebuzz.com2.html
tests/fixtures/html/livescience.com1.html
tests/fixtures/html/livescience.com2.html
tests/fixtures/html/mashable.com1.html
tests/fixtures/html/mashable.com2.html
tests/fixtures/html/mlive.com1.html
tests/fixtures/html/mlive.com2.html
tests/fixtures/html/newyorker.com1.html
tests/fixtures/html/nj.com1.html
tests/fixtures/html/nola.com1.html
tests/fixtures/html/nydailynews.com1.html
tests/fixtures/html/nypost.com1.html
tests/fixtures/html/nypost.com2.html
tests/fixtures/html/ok.co.uk1.html
tests/fixtures/html/ok.co.uk2.html
tests/fixtures/html/oregonlive.com1.html
tests/fixtures/html/oregonlive.com2.html
tests/fixtures/html/parsely.com1.html
tests/fixtures/html/parsely.com2.html
tests/fixtures/html/pe.com1.html
tests/fixtures/html/pewresearch.org1.html
tests/fixtures/html/pewresearch.org2.html
tests/fixtures/html/pixable.com1.html
tests/fixtures/html/pixable.com2.html
tests/fixtures/html/pixelmonkey.org1.html
tests/fixtures/html/pixelmonkey.org2.html
tests/fixtures/html/readwrite.com1.html
tests/fixtures/html/recipe.com1.html
tests/fixtures/html/recipe.com2.html
tests/fixtures/html/reuters.com1.html
tests/fixtures/html/reuters.com2.html
tests/fixtures/html/reuters.com3.html
tests/fixtures/html/reuters.com4.html
tests/fixtures/html/self.com1.html
tests/fixtures/html/self.com2.html
tests/fixtures/html/sitepoint.com1.html
tests/fixtures/html/sitepoint.com2.html
tests/fixtures/html/slate.com1.html
tests/fixtures/html/slate.com2.html
tests/fixtures/html/space.com1.html
tests/fixtures/html/space.com2.html
tests/fixtures/html/spanish_article.html
tests/fixtures/html/syracuse.com1.html
tests/fixtures/html/syracuse.com2.html
tests/fixtures/html/talkingpointsmemo.com1.html
tests/fixtures/html/technologyreview.com1.html
tests/fixtures/html/technologyreview.com2.html
tests/fixtures/html/teenvogue.com1.html
tests/fixtures/html/teenvogue.com2.html
tests/fixtures/html/telegraph.co.uk1.html
tests/fixtures/html/telegraph.co.uk2.html
tests/fixtures/html/thai_article.html
tests/fixtures/html/theatlantic.com1.html
tests/fixtures/html/theatlantic.com2.html
tests/fixtures/html/theatlanticcities.com1.html
tests/fixtures/html/theatlanticcities.com2.html
tests/fixtures/html/thedailybeast.com1.html
tests/fixtures/html/thedailybeast.com2.html
tests/fixtures/html/thedebrief.co.uk1.html
tests/fixtures/html/thedebrief.co.uk2.html
tests/fixtures/html/theglobeandmail.com1.html
tests/fixtures/html/theglobeandmail.com2.html
tests/fixtures/html/thekitchn.com1.html
tests/fixtures/html/thekitchn.com2.html
tests/fixtures/html/thenextweb.com1.html
tests/fixtures/html/theonion.com1.html
tests/fixtures/html/theroot.com1.html
tests/fixtures/html/tnr.com1.html
tests/fixtures/html/tnr.com2.html
tests/fixtures/html/uproxx.com1.html
tests/fixtures/html/uproxx.com2.html
tests/fixtures/html/upworthy.com1.html
tests/fixtures/html/upworthy.com2.html
tests/fixtures/html/usnews.com1.html
tests/fixtures/html/usnews.com2.html
tests/fixtures/html/vanityfair.com1.html
tests/fixtures/html/vogue.de1.html
tests/fixtures/html/vogue.de2.html
tests/fixtures/html/wetpaint.com1.html
tests/fixtures/html/wetpaint.com2.html
tests/fixtures/html/wired.com1.html
tests/fixtures/html/wired.com2.html
tests/fixtures/html/wnet.org1.html
tests/fixtures/html/wnet.org2.html
tests/fixtures/html/yahoo_main_site.html
tests/fixtures/html/youbeauty.com1.html
tests/fixtures/html/youbeauty.com2.html
tests/fixtures/text/247wallst.com1.txt
tests/fixtures/text/247wallst.com2.txt
tests/fixtures/text/about.com1.txt
tests/fixtures/text/about.com2.txt
tests/fixtures/text/adoption.com1.txt
tests/fixtures/text/al.com1.txt
tests/fixtures/text/al.com2.txt
tests/fixtures/text/apartmenttherapy.com1.txt
tests/fixtures/text/apartmenttherapy.com2.txt
tests/fixtures/text/arabic.txt
tests/fixtures/text/architecturaldigest.com1.txt
tests/fixtures/text/architecturaldigest.com2.txt
tests/fixtures/text/avclub.com1.txt
tests/fixtures/text/avclub.com2.txt
tests/fixtures/text/backstage.com1.txt
tests/fixtures/text/backstage.com2.txt
tests/fixtures/text/bhg.com1.txt
tests/fixtures/text/bhg.com2.txt
tests/fixtures/text/bloomberg.com1.txt
tests/fixtures/text/bostonherald.com1.txt
tests/fixtures/text/bostonherald.com2.txt
tests/fixtures/text/businessinsider.com1.txt
tests/fixtures/text/businessinsider.com2.txt
tests/fixtures/text/businessweek.com1.txt
tests/fixtures/text/businessweek.com2.txt
tests/fixtures/text/chinese.txt
tests/fixtures/text/cleveland.com1.txt
tests/fixtures/text/cleveland.com2.txt
tests/fixtures/text/cnn.txt
tests/fixtures/text/cnn_summary.txt
tests/fixtures/text/cntraveler.com1.txt
tests/fixtures/text/cntraveler.com2.txt
tests/fixtures/text/coolhunting.com1.txt
tests/fixtures/text/cricket.com.au1.txt
tests/fixtures/text/cricket.com.au2.txt
tests/fixtures/text/dailycaller.com1.txt
tests/fixtures/text/dailycaller.com2.txt
tests/fixtures/text/dailystar.co.uk1.txt
tests/fixtures/text/dailystar.co.uk2.txt
tests/fixtures/text/dallasnews.com1.txt
tests/fixtures/text/dallasnews.com2.txt
tests/fixtures/text/details.com1.txt
tests/fixtures/text/details.com2.txt
tests/fixtures/text/elle.com1.txt
tests/fixtures/text/elle.com2.txt
tests/fixtures/text/flavorwire.com1.txt
tests/fixtures/text/flavorwire.com2.txt
tests/fixtures/text/fool.com1.txt
tests/fixtures/text/fool.com2.txt
tests/fixtures/text/foxbusiness.com1.txt
tests/fixtures/text/foxbusiness.com2.txt
tests/fixtures/text/foxnews.com1.txt
tests/fixtures/text/foxnews.com2.txt
tests/fixtures/text/foxnews.com3.txt
tests/fixtures/text/foxnews.com4.txt
tests/fixtures/text/glamour.com1.txt
tests/fixtures/text/glamour.com2.txt
tests/fixtures/text/globalnews.ca1.txt
tests/fixtures/text/globalnews.ca2.txt
tests/fixtures/text/gq.com1.txt
tests/fixtures/text/gq.com2.txt
tests/fixtures/text/graziadaily.co.uk1.txt
tests/fixtures/text/graziadaily.co.uk2.txt
tests/fixtures/text/gulflive.com1.txt
tests/fixtures/text/gulflive.com2.txt
tests/fixtures/text/huffingtonpost.com1.txt
tests/fixtures/text/japanese.txt
tests/fixtures/text/japanese2.txt
tests/fixtures/text/lifebuzz.com1.txt
tests/fixtures/text/lifebuzz.com2.txt
tests/fixtures/text/livescience.com1.txt
tests/fixtures/text/livescience.com2.txt
tests/fixtures/text/mashable.com1.txt
tests/fixtures/text/mashable.com2.txt
tests/fixtures/text/mlive.com1.txt
tests/fixtures/text/mlive.com2.txt
tests/fixtures/text/newyorker.com1.txt
tests/fixtures/text/nj.com1.txt
tests/fixtures/text/nola.com1.txt
tests/fixtures/text/nydailynews.com1.txt
tests/fixtures/text/nypost.com1.txt
tests/fixtures/text/nypost.com2.txt
tests/fixtures/text/ok.co.uk1.txt
tests/fixtures/text/ok.co.uk2.txt
tests/fixtures/text/oregonlive.com1.txt
tests/fixtures/text/oregonlive.com2.txt
tests/fixtures/text/parsely.com1.txt
tests/fixtures/text/parsely.com2.txt
tests/fixtures/text/pe.com1.txt
tests/fixtures/text/pewresearch.org1.txt
tests/fixtures/text/pewresearch.org2.txt
tests/fixtures/text/pixable.com1.txt
tests/fixtures/text/pixable.com2.txt
tests/fixtures/text/pixelmonkey.org1.txt
tests/fixtures/text/pixelmonkey.org2.txt
tests/fixtures/text/readwrite.com1.txt
tests/fixtures/text/recipe.com1.txt
tests/fixtures/text/recipe.com2.txt
tests/fixtures/text/reuters.com1.txt
tests/fixtures/text/reuters.com2.txt
tests/fixtures/text/reuters.com3.txt
tests/fixtures/text/reuters.com4.txt
tests/fixtures/text/reuters.com5.txt
tests/fixtures/text/reuters.com6.txt
tests/fixtures/text/self.com1.txt
tests/fixtures/text/self.com2.txt
tests/fixtures/text/sitepoint.com1.txt
tests/fixtures/text/sitepoint.com2.txt
tests/fixtures/text/slate.com1.txt
tests/fixtures/text/slate.com2.txt
tests/fixtures/text/space.com1.txt
tests/fixtures/text/space.com2.txt
tests/fixtures/text/spanish.txt
tests/fixtures/text/syracuse.com1.txt
tests/fixtures/text/syracuse.com2.txt
tests/fixtures/text/talkingpointsmemo.com1.txt
tests/fixtures/text/technologyreview.com1.txt
tests/fixtures/text/technologyreview.com2.txt
tests/fixtures/text/teenvogue.com1.txt
tests/fixtures/text/teenvogue.com2.txt
tests/fixtures/text/telegraph.co.uk1.txt
tests/fixtures/text/telegraph.co.uk2.txt
tests/fixtures/text/thai.txt
tests/fixtures/text/theatlantic.com1.txt
tests/fixtures/text/theatlantic.com2.txt
tests/fixtures/text/theatlanticcities.com1.txt
tests/fixtures/text/theatlanticcities.com2.txt
tests/fixtures/text/thedailybeast.com1.txt
tests/fixtures/text/thedailybeast.com2.txt
tests/fixtures/text/thedebrief.co.uk1.txt
tests/fixtures/text/thedebrief.co.uk2.txt
tests/fixtures/text/theglobeandmail.com1.txt
tests/fixtures/text/theglobeandmail.com2.txt
tests/fixtures/text/thekitchn.com1.txt
tests/fixtures/text/thekitchn.com2.txt
tests/fixtures/text/thenextweb.com1.txt
tests/fixtures/text/theonion.com1.txt
tests/fixtures/text/theroot.com1.txt
tests/fixtures/text/tnr.com1.txt
tests/fixtures/text/tnr.com2.txt
tests/fixtures/text/uproxx.com1.txt
tests/fixtures/text/uproxx.com2.txt
tests/fixtures/text/upworthy.com1.txt
tests/fixtures/text/upworthy.com2.txt
tests/fixtures/text/usnews.com1.txt
tests/fixtures/text/usnews.com2.txt
tests/fixtures/text/vanityfair.com1.txt
tests/fixtures/text/vogue.de1.txt
tests/fixtures/text/vogue.de2.txt
tests/fixtures/text/wetpaint.com1.txt
tests/fixtures/text/wetpaint.com2.txt
tests/fixtures/text/wired.com1.txt
tests/fixtures/text/wired.com2.txt
tests/fixtures/text/wnet.org1.txt
tests/fixtures/text/wnet.org2.txt
tests/fixtures/text/youbeauty.com1.txt
tests/fixtures/text/youbeauty.com2.txt
tests/system/__init__.py
tests/system/test_energy_investment.py
tests/units/__init__.py
tests/units/test_article.py
tests/units/test_configuration.py
tests/units/test_content_extractor.py
tests/units/test_exhaustive.py
tests/units/test_mthreading.py
tests/units/test_multilanguage.py
tests/units/test_pdf.py
tests/units/test_source.py
tests/units/test_sources.py
tests/units/test_url.py
tests/units/test_utils.py