# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/1.3_exp.csnc_python.ipynb (unless otherwise specified).

__all__ = ['jsonl_list_to_dataframe', 'plain_json_list_to_dataframe', 'columns_long_list', 'columns_short_list']

# Cell
# Column names selected by default when loading records with
# jsonl_list_to_dataframe.
columns_long_list = [
    'repo',
    'path',
    'url',
    'code',
    'code_tokens',
    'docstring',
    'docstring_tokens',
    'language',
    'partition',
]

# Reduced selection: only the token columns plus language and partition.
columns_short_list = [
    'code_tokens',
    'docstring_tokens',
    'language',
    'partition',
]

# export
def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Read gzip-compressed JSON-lines files and stack them into one DataFrame.

    Args:
        file_list: Iterable of paths (or file-like objects) accepted by
            ``pd.read_json``; each is read as gzip-compressed JSON lines.
        columns: List of column names to keep from every file. Defaults to
            ``columns_long_list``.

    Returns:
        A single DataFrame made by concatenating the per-file frames in the
        order given. Each file's own row index is kept as-is.
    """
    frames = []
    for jsonl_path in file_list:
        frame = pd.read_json(
            jsonl_path,
            orient='records',
            compression='gzip',
            lines=True,
        )
        # Keep only the requested columns before stacking.
        frames.append(frame[columns])
    return pd.concat(frames, sort=False)

# export
def plain_json_list_to_dataframe(file_list, columns):
    """Load a list of uncompressed jsonl files into a pandas DataFrame.

    Args:
        file_list: Iterable of paths (or file-like objects) accepted by
            ``pd.read_json``; each is read as plain (uncompressed) JSON lines.
        columns: List of column names to keep from every file.

    Returns:
        A single DataFrame made by concatenating the per-file frames in the
        order given. Each file's own row index is kept as-is.
    """
    # NOTE: the keyword is ``orient`` (singular); the previous ``orients``
    # spelling made pd.read_json raise TypeError on every call.
    return pd.concat([pd.read_json(f,
                                   orient='records',
                                   compression=None,
                                   lines=True)[columns]
                      for f in file_list], sort=False)

# Cell
# Imports
import dit
import math
import os
import logging

import matplotlib.pyplot as plt
import pandas as pd
import sentencepiece as sp

from collections import Counter
from pathlib import Path
from scipy.stats import sem, t
from statistics import mean, median, stdev
from tqdm.notebook import tqdm

# ds4se
from ...mgmnt.prep.bpe import *
from ..info import *
from ...desc.stats import *