#!/usr/bin/env python
#
#  Copyright 2014+ Carnegie Mellon University
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import argparse

from flexneuart.io.utils import jsonl_gen

# Command-line tool: reports the number of entries in a JSONL file together
# with the total and average number of whitespace-separated tokens found in
# one chosen field.
parser = argparse.ArgumentParser(description='Count tokens and number of entries in JSONL')

# --input is passed as-is to jsonl_gen; --field names the entry key whose
# text is tokenized.
parser.add_argument('--input', type=str, required=True)
parser.add_argument('--field', type=str, required=True)

args = parser.parse_args()

qty = 0      # number of entries seen, including those lacking the field
tok_qty = 0  # total number of tokens over all entries that have the field
field = args.field

# NOTE(review): jsonl_gen is presumably a generator of dict-like entries,
# one per JSONL line; the field value is assumed to be a string — confirm.
for e in jsonl_gen(args.input):
    qty += 1
    # Entries without the field still count as entries, but add no tokens.
    if field in e:
        # split() without arguments tokenizes on runs of whitespace.
        tok_qty += len(e[field].split())

# Guard against an input that produced no entries: dividing by zero would
# crash the script instead of reporting the (empty) statistics.
avg_tok_qty = tok_qty / qty if qty > 0 else 0.0

print(f'# of entries {qty} avg. tokens per entry: {avg_tok_qty} total # of tokens: {tok_qty}')
