#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""Dataload missingreferences command"""

import dataclasses
import json
import os
import re
import urllib
from typing import List, Set, Tuple
from typing import re as regex

import click

from osducli.click_cli import CustomClickCommand, State, command_with_output
from osducli.cliclient import handle_cli_exceptions
from osducli.commands.dataload.verify import batch_verify
from osducli.config import (
    CONFIG_ACL_OWNER,
    CONFIG_ACL_VIEWER,
    CONFIG_LEGAL_TAG,
    CLIConfig,
)
from osducli.log import get_logger
from osducli.util.file import get_files_from_path

# Module-level logger obtained from the CLI's logging helper.
logger = get_logger(__name__)

# Keys of the manifest sections that are scanned for entities. In this module the
# reference and master data sections are read from the top level of a manifest,
# while the work product, work product components and datasets sections are read
# from inside the "Data" section.
DATA_SECTION = "Data"
DATASETS_SECTION = "Datasets"
MASTER_DATA_SECTION = "MasterData"
REFERENCE_DATA_SECTION = "ReferenceData"
WORK_PRODUCT_SECTION = "WorkProduct"
WORK_PRODUCT_COMPONENTS_SECTION = "WorkProductComponents"


# click entry point
@click.command(cls=CustomClickCommand)
# Input: a single manifest file or a directory; the path must already exist.
@click.option(
    "-p",
    "--path",
    help="Path to a manifest file or files to check.",
    type=click.Path(exists=True, file_okay=True, dir_okay=True, readable=True, resolve_path=True),
    required=True,
)
# Optional output directory (must already exist). When omitted, missing ids are only logged.
@click.option(
    "-g",
    "--generated",
    help="Path where manifests for missing references should be created.",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, writable=True, resolve_path=True),
    required=False,
)
@click.option(
    "-a",
    "--authority",
    help="Schema authority to use when generating manifest files.",
    default="osdu",
    show_default=True,
    required=False,
)
# The acl / legal options have no default here; check_references falls back to the
# values stored in the CLI configuration when they are None.
@click.option(
    "-aclo",
    "--acl-owners",
    help="Acl owners to use when generating manifest files. If not specified the global value is used.",
)
@click.option(
    "-aclv",
    "--acl-viewers",
    help="Acl viewers to use when generating manifest files. If not specified the global value is used.",
)
@click.option(
    "-l",
    "--legal-tags",
    help="Legal tag to use when generating manifest files. If not specified the global value is used.",
)
# Batch options are currently disabled.
# @click.option("-b", "--batch", help="Batch size.", type=int, default=200, show_default=True)
# @click.option(
#     "--batch-across-files",
#     is_flag=True,
#     default=True,
#     help="Create batches across files for speed.",
#     show_default=True,
# )
@handle_cli_exceptions
@command_with_output(None)
def _click_command(
    state: State,
    path: str,
    generated: str,
    authority: str,
    acl_owners: str,
    acl_viewers: str,
    legal_tags: str,
):
    """Find any referenced id's missing in OSDU (work in progress).

    This command will try and extract all referenced id's in the given manifests and then check
    whether these exist in OSDU. You might typically use this command prior to ingestion to check
    whether ingestion might fail due to the referential integrity check."""
    # NOTE: the docstring above is left untouched on purpose; presumably click shows it
    # as the command's help text.
    # Thin wrapper: forward the parsed options unchanged. ``generated`` is passed as
    # check_references' ``manifest_output_dir`` argument.
    return check_references(
        state,
        path,
        generated,
        authority,
        acl_owners,
        acl_viewers,
        legal_tags,
    )


# region Create Manifests
def create_reference_data_manifest_json(
    data_partition: str,
    kind: str,
    code: str,
    authority: str,
    acl_owners: str,
    acl_viewers: str,
    legal_tags: str,
):
    """
    Build the manifest dictionary for one placeholder reference data record.

    Args:
        data_partition (str): Prefix of the manifest kind and of the record id.
        kind (str): Entity type, e.g. ``reference-data--UnitOfMeasure``.
        code (str): Used as the last id segment and as the Name, ID and Code values.
        authority (str): Schema authority used as prefix of the record kind.
        acl_owners (str): Single entry placed in the ``acl.owners`` list.
        acl_viewers (str): Single entry placed in the ``acl.viewers`` list.
        legal_tags (str): Single entry placed in the ``legal.legaltags`` list.

    Returns:
        dict: Manifest with a one-element ``ReferenceData`` list.
    """
    code_text = f"{code}"

    # Fields written for every reference data kind.
    payload = {
        "Name": code_text,
        "Description": "",
        "ID": code_text,
        "Code": code_text,
        "InactiveIndicator": False,
        "AttributionAuthority": "Equinor",
        "AttributionPublication": "Equinor Automatic Reference Data Dictionary V1.0",
        "AttributionRevision": "1.0",
        "Source": "Automatically generated by OSDU CLI",
    }

    # Units of measure get extra "e.g." placeholder values that are meant to be edited.
    if kind == "reference-data--UnitOfMeasure":
        payload["BaseForConversion"] = "e.g. cd/m2"
        payload["MemberUnits"] = ["e.g. cd/m2"]
        payload["ParentUnitQuantity"] = "e.g. J/L2"
        payload["PersistableReference"] = '{"ancestry":"e.g. J/L2.luminance","type":"e.g. UM"}'
        payload["UnitDimension"] = "e.g. J/L2"

    record = {
        "id": f"{data_partition}:{kind}:{code}",
        "kind": f"{authority}:wks:{kind}:1.0.0",
        "acl": {
            "owners": [acl_owners],
            "viewers": [acl_viewers],
        },
        "legal": {
            "legaltags": [legal_tags],
            "otherRelevantDataCountries": ["US"],
        },
        "data": payload,
    }

    return {
        "kind": f"{data_partition}:wks:Manifest:1.0.0",
        "ReferenceData": [record],
    }


def create_master_data_manifest_json(
    data_partition: str,
    kind: str,
    code: str,
    authority: str,
    acl_owners: str,
    acl_viewers: str,
    legal_tags: str,
):
    """
    Build the manifest dictionary for one placeholder master data record.

    For the kinds known to this function the ``data`` element receives a single name
    field holding the URL-decoded ``code``. For any other kind ``data`` is left empty
    and a warning is logged.

    Args:
        data_partition (str): Prefix of the manifest kind and of the record id.
        kind (str): Entity type, e.g. ``master-data--Organisation``.
        code (str): Last id segment; its URL-decoded form is used as the name value.
        authority (str): Schema authority used as prefix of the record kind.
        acl_owners (str): Single entry placed in the ``acl.owners`` list.
        acl_viewers (str): Single entry placed in the ``acl.viewers`` list.
        legal_tags (str): Single entry placed in the ``legal.legaltags`` list.

    Returns:
        dict: Manifest with a one-element ``MasterData`` list.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that the
    # ``urllib.parse`` submodule has been loaded.
    from urllib.parse import unquote  # pylint: disable=import-outside-toplevel

    # Name of the data field that receives the decoded code, per supported kind.
    name_field_by_kind = {
        "master-data--Organisation": "OrganisationName",
        "master-data--GeoPoliticalEntity": "GeoPoliticalEntityName",
        "master-data--Field": "FieldName",
        "master-data--Seismic3DInterpretationSet": "ProjectName",
    }

    record_data = {}
    name_field = name_field_by_kind.get(kind)
    if name_field is not None:
        record_data[name_field] = unquote(code)
    else:
        logger.warning(
            "%s not handled and will likely fail upload with empty data element exception.", kind
        )

    return {
        "kind": f"{data_partition}:wks:Manifest:1.0.0",
        "MasterData": [
            {
                "id": f"{data_partition}:{kind}:{code}",
                "kind": f"{authority}:wks:{kind}:1.0.0",
                "acl": {
                    "owners": [acl_owners],
                    "viewers": [acl_viewers],
                },
                "legal": {
                    "legaltags": [legal_tags],
                    "otherRelevantDataCountries": ["US"],
                },
                "data": record_data,
            }
        ],
    }


def write_manifest(kind, name, output_dir, manifest):
    """Write the specified manifest to ``<output_dir>/<kind>/<name>.json``.

    The ``kind`` sub folder is created when it does not exist yet. An existing file
    with the same name is overwritten.

    Args:
        kind (str): Name of the sub folder the file is placed in.
        name (str): File name without the ``.json`` extension.
        output_dir (str): Base directory for generated manifests.
        manifest: JSON-serializable object to write (indented by two spaces).
    """
    folder = os.path.join(output_dir, kind)
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(folder, exist_ok=True)
    # Explicit encoding so the output does not depend on the platform default.
    with open(os.path.join(folder, f"{name}.json"), "w", encoding="utf-8") as outfile:
        json.dump(manifest, outfile, indent=2)


def create_reference_data_manifest(
    data_partition: str,
    kind: str,
    name: str,
    authority: str,
    acl_owners: str,
    acl_viewers: str,
    legal_tags: str,
    output_dir: str,
    # , batch_size: int, batch_across_files: bool
):
    """Generate a reference data manifest and save it below the output directory.

    The manifest content comes from create_reference_data_manifest_json and the file is
    written by write_manifest, using ``kind`` as folder name and ``name`` as file name.

    Args:
        data_partition (str): Forwarded to the manifest builder.
        kind (str): Entity type; also the name of the output sub folder.
        name (str): Record code; also the output file name (without extension).
        authority (str): Forwarded to the manifest builder.
        acl_owners (str): Forwarded to the manifest builder.
        acl_viewers (str): Forwarded to the manifest builder.
        legal_tags (str): Forwarded to the manifest builder.
        output_dir (str): Base directory for generated manifests.
    """
    write_manifest(
        kind,
        name,
        output_dir,
        create_reference_data_manifest_json(
            data_partition=data_partition,
            kind=kind,
            code=name,
            authority=authority,
            acl_owners=acl_owners,
            acl_viewers=acl_viewers,
            legal_tags=legal_tags,
        ),
    )


def create_master_data_manifest(
    data_partition: str,
    kind: str,
    name: str,
    authority: str,
    acl_owners: str,
    acl_viewers: str,
    legal_tags: str,
    output_dir: str,
):
    """Generate a master data manifest and save it below the output directory.

    The manifest content comes from create_master_data_manifest_json and the file is
    written by write_manifest, using ``kind`` as folder name and ``name`` as file name.

    Args:
        data_partition (str): Forwarded to the manifest builder.
        kind (str): Entity type; also the name of the output sub folder.
        name (str): Record code; also the output file name (without extension).
        authority (str): Forwarded to the manifest builder.
        acl_owners (str): Forwarded to the manifest builder.
        acl_viewers (str): Forwarded to the manifest builder.
        legal_tags (str): Forwarded to the manifest builder.
        output_dir (str): Base directory for generated manifests.
    """
    write_manifest(
        kind,
        name,
        output_dir,
        create_master_data_manifest_json(
            data_partition=data_partition,
            kind=kind,
            code=name,
            authority=authority,
            acl_owners=acl_owners,
            acl_viewers=acl_viewers,
            legal_tags=legal_tags,
        ),
    )


# endregion Create Manifests


@dataclasses.dataclass
class EntityId:
    """An entity id split into the id itself and an optional version."""

    # Id without the version segment.
    id: str  # pylint: disable=invalid-name
    # Version segment; stays empty when no version is known.
    version: str = ""

    @property
    def srn(self):
        """Return ``<id>:<version>``; the colon is kept even when version is empty."""
        record_id, record_version = self.id, self.version
        return f"{record_id}:{record_version}"


def split_id(id_value: str) -> EntityId:
    """
    Separate a trailing version from an id so the bare id can be searched later.

    A trailing colon is removed, and a final colon-separated segment made only of
    digits is taken as the version. Any other value is returned unchanged as the id.

    :id_value: ID of some entity with or without versions.
    :return: EntityId with the bare id and the version ("" when there is none).
    """
    head, separator, tail = id_value.rpartition(":")

    # "some:id:" -> the id simply ends with a colon, there is no version.
    if separator and not tail:
        return EntityId(head, "")

    # "some:id:123" -> the last segment is the version.
    if tail.isdigit():
        return EntityId(head, tail)

    return EntityId(id_value, "")


class ValidationIntegrityError(Exception):
    """Raise when an entity does not pass validation integrity.

    The raise sites in this module construct the error as
    ``ValidationIntegrityError(entity, reason="...")``. A plain ``Exception`` subclass
    rejects keyword arguments with ``TypeError``, so the constructor accepts ``reason``
    explicitly.

    Attributes:
        skipped_entity: First positional argument (the rejected entity) or None.
        reason: The ``reason`` keyword if given, otherwise the second positional
            argument, otherwise an empty string.
    """

    def __init__(self, *args, reason=None):
        if reason is None:
            # Positional-only construction keeps ``args`` exactly as passed.
            super().__init__(*args)
            reason = args[1] if len(args) > 1 else ""
        else:
            # Append the reason so it shows up in str()/repr() of the error.
            super().__init__(*args, reason)
        self.skipped_entity = args[0] if args else None
        self.reason = reason


class EmptyManifestError(Exception):
    """Raised when the manifest handed in for checking is missing or empty."""


class ManifestIntegrity:
    """Check that ids referenced by manifest entities exist and drop entities that fail.

    References are found by matching the id patterns below against the JSON text of an
    entity. Ids of entities that are part of the manifest being checked are treated as
    internal; every other reference is verified through ``batch_verify``.
    """

    # Each pattern matches text enclosed in double quotes (quotes excluded) that looks
    # like an id of the given group, or like a surrogate key.
    REFERENCE_DATA_ID_PATTERN = re.compile(
        r"(?<=\")[\w\-\.]+:reference-data\-\-[\w\-\.]+:.[^,;\"]+(?=\")", re.I | re.M
    )
    MASTER_DATA_ID_PATTERN = re.compile(
        r"(?<=\")[\w\-\.]+:master-data\-\-[\w\-\.]+:.[^,;\"]+(?=\")", re.I | re.M
    )
    WORK_PRODUCT_ID_PATTERN = re.compile(
        r"(?<=\")[\w\-\.]+:work-product\-\-[\w\-\.]+:.[^,;\"]+(?=\")", re.I | re.M
    )
    WORK_PRODUCT_COMPONENT_ID_PATTERN = re.compile(
        r"(?<=\")[\w\-\.]+:work-product-component\-\-[\w\-\.]+:.[^,;\"]+(?=\")", re.I | re.M
    )
    DATASET_ID_PATTERN = re.compile(
        r"(?<=\")[\w\-\.]+:dataset\-\-[\w\-\.]+:.[^,;\"]+(?=\")", re.I | re.M
    )
    SURROGATE_KEY_PATTERN = re.compile(r"(?<=\")surrogate-key:[\w\-\.\d]+(?=\")", re.I | re.M)

    def __init__(
        self,
        config: CLIConfig,
    ):
        """
        :param config: CLI configuration, handed to ``batch_verify`` when checking ids.
        """
        self.config = config
        self.ids_for_validation = []
        # Ids of the entities contained in the manifest currently being checked.
        self.entities_ids = set()
        self.rejected_entities_ids = set()
        self.ref_patterns = [
            self.REFERENCE_DATA_ID_PATTERN,
            self.MASTER_DATA_ID_PATTERN,
            self.WORK_PRODUCT_ID_PATTERN,
            self.WORK_PRODUCT_COMPONENT_ID_PATTERN,
            self.DATASET_ID_PATTERN,
            self.SURROGATE_KEY_PATTERN,
        ]

        super().__init__()

    @staticmethod
    def _match_id_with_pattern(pattern: re.Pattern, source: str) -> List[str]:
        """Return every match of ``pattern`` in ``source``."""
        # Annotated with re.Pattern; the typing.re alias is deprecated and removed in
        # Python 3.13.
        return pattern.findall(source)

    @staticmethod
    def _collect_ids_by_data_types(manifest_section: dict, data_type: str):
        """
        Collect manifest entities ids by their data types.

        :param manifest_section: A part of the manifest, where data types can be accessed.
        :param data_type: ReferenceData, MasterData etc.
        :return: List of the non-empty "id" values found in that section.
        """
        return [elem["id"] for elem in manifest_section.get(data_type) or [] if elem.get("id")]

    def collect_manifest_entities_ids(self, manifest: dict):
        """
        Collect manifest's entities ids to exclude them while checking integrity.

        The result replaces ``self.entities_ids``.

        :manifest: Manifest.
        """
        entities_ids = []
        for data_type in (REFERENCE_DATA_SECTION, MASTER_DATA_SECTION):
            entities_ids.extend(self._collect_ids_by_data_types(manifest, data_type))

        data_section = manifest.get(DATA_SECTION)
        if data_section:
            work_product_id = data_section.get(WORK_PRODUCT_SECTION, {}).get("id")
            if work_product_id:
                entities_ids.append(work_product_id)

            for data_type in (WORK_PRODUCT_COMPONENTS_SECTION, DATASETS_SECTION):
                entities_ids.extend(self._collect_ids_by_data_types(data_section, data_type))

        self.entities_ids = set(entities_ids)

    def extract_references(self, entity: dict) -> Set:
        """
        Extract references (everything looking like an ID to another entity) from the entity.

        :param entity: Manifest's entity
        :return: Set of ids to other entities or records; the entity's own id is excluded.
        """
        manifest_str = json.dumps(entity)
        references = set()
        for pattern in self.ref_patterns:
            references.update(self._match_id_with_pattern(pattern, manifest_str))
        references.discard(entity.get("id"))
        return references

    def extract_external_references(self, entity: dict, entity_references: set) -> List[EntityId]:
        """
        Extract external reference ids from an entity. These references are supposed
        to be searchable via Search service.

        A reference is external when its id (without version) is not one of the ids
        collected by ``collect_manifest_entities_ids``.

        :param entity: Manifest's entity (only used for the debug log).
        :param entity_references: All entity's references.
        :return: List of external references.
        """
        split_references = (split_id(ref) for ref in entity_references)
        external_references = [ref for ref in split_references if ref.id not in self.entities_ids]
        logger.debug(
            "Entity id: %s, kind %s. External reference ids: %s",
            entity.get("id"),
            entity.get("kind"),
            external_references,
        )

        return external_references

    def find_missing_external_ids(self, external_references: List[EntityId]) -> Set[str]:
        """
        Find external references that are not reported as found by ``batch_verify``.

        :param external_references: Records IDs are supposed to be found in Search.
        :return: Set of not found references, each as ``<id>:<version>`` (the version part
            is empty for references without a version).
        """
        missing_ids = set()
        # The lookup is done on ids without versions.
        external_references_without_version = [e.id for e in external_references]

        batch_size = 200
        # Output lists handed to batch_verify; presumably filled in place - only
        # ``found`` is read afterwards. TODO confirm the meaning of the final ``True``.
        found = []
        not_found = []
        batch_verify(
            self.config,
            batch_size,
            external_references_without_version,
            found,
            not_found,
            True,
        )

        for entity_id in external_references:
            # A reference without a version refers to the latest version, so it is
            # looked up by its bare id; versioned references are looked up in full.
            entity_srn = entity_id.srn if entity_id.version else entity_id.id
            if entity_srn not in found:
                missing_ids.add(entity_id.srn)

        return missing_ids

    def _validate_referential_integrity(self, entity: dict):
        """
        Check if a manifest's entity passes referential integrity.

        :param entity: Manifest's entity.
        :raises ValidationIntegrityError: When at least one external reference is missing.
        """
        missing_ids = set()
        references = self.extract_references(entity)
        external_references = self.extract_external_references(entity, references)
        if external_references:
            missing_ids.update(self.find_missing_external_ids(external_references) or ())

        if missing_ids:
            logger.warning(
                "Resource with kind %s and id: '%s' was rejected. Missing ids '%s'",
                entity.get("kind"),
                entity.get("id"),
                missing_ids,
            )
            # The reason is passed positionally, which works for any Exception subclass;
            # built-in exception constructors reject keyword arguments.
            raise ValidationIntegrityError(entity, f"Missing referential id: {missing_ids}")

    def _ensure_manifest_entity_integrity(
        self, manifest_section: List[dict]
    ) -> Tuple[List[dict], List[dict]]:
        """
        Split the entities of a manifest section into valid and rejected ones.

        :param manifest_section: A part of the manifest, where data types can be accessed.
        :return: List of valid entities and list of invalid entities.
        """
        valid_entities = []
        skipped_entities = []
        for entity in manifest_section:
            try:
                self._validate_referential_integrity(entity)
            except ValidationIntegrityError:
                # Keep track of the rejected entity instead of dropping it silently.
                skipped_entities.append(entity)
            else:
                valid_entities.append(entity)
        return valid_entities, skipped_entities

    def _ensure_work_product_entity_integrity(self, work_product: dict) -> Tuple[dict, List[dict]]:
        """
        Check the referential integrity of the work product.

        :param work_product: A part of the manifest, where data types can be accessed.
        :return: The work product if it is valid, otherwise, empty dict; followed by the
            list of rejected entities (the work product itself when it is invalid).
        """
        try:
            self._validate_referential_integrity(work_product)
        except ValidationIntegrityError:
            return {}, [work_product]
        return work_product, []

    @staticmethod
    def _ensure_wpc_artefacts_integrity(wpc: dict):
        """
        Reject a work product component whose Artefacts repeat ids listed in its Datasets.

        :param wpc: Work product component; its "data" element must be present.
        :raises ValidationIntegrityError: When an artefact ResourceID is also in Datasets.
        """
        artefacts = wpc["data"].get("Artefacts")
        if not artefacts:
            logger.debug("WPC: %s doesn't have Artefacts field. Mark it as valid.", wpc.get("id"))
            return
        artefacts_resource_ids = {artefact["ResourceID"] for artefact in artefacts}
        datasets = set(wpc["data"].get(DATASETS_SECTION, []))
        duplicated_ids = artefacts_resource_ids.intersection(datasets)
        if duplicated_ids:
            logger.warning(
                "Resource kind '%s' and id '%s' was rejected. "
                "The WPC's Artefacts field contains the same ids as in "
                "the WPC's 'Datasets': %s.",
                wpc.get("kind"),
                wpc.get("id", ""),
                duplicated_ids,
            )
            # Positional reason: built-in exception constructors reject keyword arguments.
            raise ValidationIntegrityError(
                wpc, f"It has duplicated Datasets and Artefacts: {duplicated_ids}."
            )

    def _ensure_artefacts_integrity(
        self, work_product_components: list
    ) -> Tuple[List[dict], List[dict]]:
        """
        Split work product components by the outcome of the artefacts integrity check.

        :param work_product_components: Work product components to check.
        :return: List of valid wpcs and list of rejected wpcs.
        """
        valid_work_product_components = []
        skipped_entities = []
        for wpc in work_product_components:
            try:
                self._ensure_wpc_artefacts_integrity(wpc)
            except ValidationIntegrityError:
                skipped_entities.append(wpc)
            else:
                valid_work_product_components.append(wpc)
        return valid_work_product_components, skipped_entities

    def ensure_integrity(self, manifest: dict = None) -> Tuple[dict, List[dict]]:
        """
        Validate reference ids in traversal manifest file

        The manifest is modified in place: every checked section is replaced by its
        valid entities only.

        :param manifest: Manifest to check.
        :return: The manifest with only valid entities in the same structure, and the
            list of rejected entities.
        :raises EmptyManifestError: When no manifest (or an empty one) is given.
        """
        skipped_entities = []

        if not manifest:
            raise EmptyManifestError()

        self.collect_manifest_entities_ids(manifest)

        for data_type in (REFERENCE_DATA_SECTION, MASTER_DATA_SECTION):
            if manifest.get(data_type):
                valid_entities, not_valid_entities = self._ensure_manifest_entity_integrity(
                    manifest[data_type]
                )
                manifest[data_type] = valid_entities
                skipped_entities.extend(not_valid_entities)

        data_section = manifest.get(DATA_SECTION)
        if data_section:
            if data_section.get(DATASETS_SECTION):
                valid_entities, not_valid_entities = self._ensure_manifest_entity_integrity(
                    data_section[DATASETS_SECTION]
                )
                data_section[DATASETS_SECTION] = valid_entities
                skipped_entities.extend(not_valid_entities)

            if data_section.get(WORK_PRODUCT_COMPONENTS_SECTION):
                # Components must pass the referential check first and then the
                # artefacts check.
                valid_entities, not_valid_entities = self._ensure_manifest_entity_integrity(
                    data_section[WORK_PRODUCT_COMPONENTS_SECTION]
                )
                skipped_entities.extend(not_valid_entities)
                valid_entities, not_valid_entities = self._ensure_artefacts_integrity(
                    valid_entities
                )
                data_section[WORK_PRODUCT_COMPONENTS_SECTION] = valid_entities
                skipped_entities.extend(not_valid_entities)

            if data_section.get(WORK_PRODUCT_SECTION):
                valid_entities, not_valid_entities = self._ensure_work_product_entity_integrity(
                    data_section[WORK_PRODUCT_SECTION]
                )
                data_section[WORK_PRODUCT_SECTION] = valid_entities
                skipped_entities.extend(not_valid_entities)

        return manifest, skipped_entities


def get_missing_ids(manifest_integrity: ManifestIntegrity, entity: dict) -> Set[str]:
    """
    Get the ids referenced by a manifest's entity that could not be found.

    :param manifest_integrity: Helper used to extract the references and look them up.
    :param entity: Manifest's entity.
    :return: Set of missing ids; empty when the entity has no external references or
        all of them were found.
    """
    references = manifest_integrity.extract_references(entity)
    external_references = manifest_integrity.extract_external_references(entity, references)
    if not external_references:
        return set()

    # TODO: cache the ids already checked across entities to avoid repeated lookups.
    # Always hand back a fresh set, also when the lookup returns nothing.
    return set(manifest_integrity.find_missing_external_ids(external_references) or ())


def get_manifest_entity_missing_ids(
    manifest_integrity, manifest_section: List[dict]
) -> Tuple[List[dict], List[dict], Set[str]]:
    """
    Split the entities of a manifest section by whether they reference missing ids.

    The section itself is not modified; the caller decides what to do with the result.

    :param manifest_integrity: Helper used to extract the references and look them up.
    :param manifest_section: A part of the manifest, where data types can be accessed.
    :return: List of valid entities, list of invalid entities and the set of all
        missing ids found in the section.
    """
    valid_entities = []
    invalid_entities = []
    missing_ids = set()
    for entity in manifest_section:
        entity_missing_ids = get_missing_ids(manifest_integrity, entity)
        if not entity_missing_ids:
            valid_entities.append(entity)
            continue
        invalid_entities.append(entity)
        missing_ids.update(entity_missing_ids)
    return valid_entities, invalid_entities, missing_ids


def get_work_product_missing_ids(
    manifest_integrity, work_product: dict
) -> Tuple[dict, List[dict], Set[str]]:
    """
    Check the work product for references to missing ids.

    :param manifest_integrity: Helper used to extract the references and look them up.
    :param work_product: A part of the manifest, where data types can be accessed.
    :return: The work product if it is valid, otherwise, empty dict; the list of invalid
        entities (the work product itself when ids are missing); the set of missing ids.
    """
    missing_ids = get_missing_ids(manifest_integrity, work_product)
    if missing_ids:
        # Invalid: report the work product in a list so callers can extend() with it.
        return {}, [work_product], missing_ids

    return work_product, [], missing_ids


def create_wpc_artefacts_missing_ids(wpc: dict):
    """
    Reject a work product component whose Artefacts repeat ids listed in its Datasets.

    A component without Artefacts is considered valid.

    :param wpc: Work product component; its "data" element must be present.
    :raises ValidationIntegrityError: When an artefact ResourceID is also in Datasets.
    """
    artefacts = wpc["data"].get("Artefacts")
    if not artefacts:
        logger.debug("WPC: %s doesn't have Artefacts field. Mark it as valid.", wpc.get("id"))
        return
    artefacts_resource_ids = {artefact["ResourceID"] for artefact in artefacts}
    datasets = set(wpc["data"].get(DATASETS_SECTION, []))
    duplicated_ids = artefacts_resource_ids.intersection(datasets)
    if duplicated_ids:
        logger.warning(
            "Resource kind '%s' and id '%s' was rejected. "
            "The WPC's Artefacts field contains the same ids as in "
            "the WPC's 'Datasets': %s.",
            wpc.get("kind"),
            wpc.get("id", ""),
            duplicated_ids,
        )
        # The reason is passed positionally, which works for any Exception subclass;
        # built-in exception constructors reject keyword arguments.
        raise ValidationIntegrityError(
            wpc, f"It has duplicated Datasets and Artefacts: {duplicated_ids}."
        )


def get_artefacts_missing_ids(
    work_product_components: list,
) -> Tuple[List[dict], List[dict], List[str]]:
    """
    Split work product components by the outcome of the artefacts integrity check.

    :param work_product_components: Work product components to check.
    :return: List of wpcs that passed, list of wpcs that were rejected and an
        always-empty list of missing ids.
    """
    passed, rejected = [], []
    for component in work_product_components:
        try:
            create_wpc_artefacts_missing_ids(component)
        except ValidationIntegrityError:
            rejected.append(component)
            continue
        passed.append(component)
    # This check never yields missing ids, hence the empty third element.
    return passed, rejected, []


def get_missing_ids_from_manifest(
    config: CLIConfig, manifest: dict = None
) -> Tuple[dict, List[dict], Set[str]]:
    """
    Collect the missing reference ids of every section of a manifest.

    The manifest is modified in place: each checked section is replaced by the
    entities that were reported as valid.

    :param config: CLI configuration used to create the ManifestIntegrity helper.
    :param manifest: Manifest to check.
    :return: The manifest, the list of invalid entities and the set of missing ids.
    :raises EmptyManifestError: When no manifest (or an empty one) is given.
    """
    if not manifest:
        raise EmptyManifestError()

    rejected = []
    all_missing = set()

    integrity = ManifestIntegrity(config)
    integrity.collect_manifest_entities_ids(manifest)

    # Top-level sections.
    for section_name in (REFERENCE_DATA_SECTION, MASTER_DATA_SECTION):
        section = manifest.get(section_name)
        if not section:
            continue
        kept, dropped, found_missing = get_manifest_entity_missing_ids(integrity, section)
        manifest[section_name] = kept
        rejected.extend(dropped)
        all_missing.update(found_missing)

    data_section = manifest.get(DATA_SECTION)
    if not data_section:
        return manifest, rejected, all_missing

    datasets = data_section.get(DATASETS_SECTION)
    if datasets:
        kept, dropped, found_missing = get_manifest_entity_missing_ids(integrity, datasets)
        data_section[DATASETS_SECTION] = kept
        rejected.extend(dropped)
        all_missing.update(found_missing)

    components = data_section.get(WORK_PRODUCT_COMPONENTS_SECTION)
    if components:
        # First the reference check, then the artefacts check on the survivors.
        kept, dropped, found_missing = get_manifest_entity_missing_ids(integrity, components)
        rejected.extend(dropped)
        all_missing.update(found_missing)
        kept, dropped, found_missing = get_artefacts_missing_ids(kept)
        data_section[WORK_PRODUCT_COMPONENTS_SECTION] = kept
        rejected.extend(dropped)
        all_missing.update(found_missing)

    work_product = data_section.get(WORK_PRODUCT_SECTION)
    if work_product:
        kept, dropped, found_missing = get_work_product_missing_ids(integrity, work_product)
        data_section[WORK_PRODUCT_SECTION] = kept
        rejected.extend(dropped)
        all_missing.update(found_missing)

    return manifest, rejected, all_missing


def check_references(  # noqa: C901 pylint: disable=R0912
    state: State,
    path: str,
    manifest_output_dir: str = None,
    authority: str = "osdu",
    acl_owners: str = None,
    acl_viewers: str = None,
    legal_tags: str = None,  # , batch_size: int, batch_across_files: bool
) -> None:
    """Find any referenced id's that are missing in OSDU.

    Every ``.json`` file found under ``path`` is scanned. The missing ids are logged
    and, when ``manifest_output_dir`` is given, a placeholder manifest is written for
    each missing reference data or master data id.

    Args:
        state (State): Global state
        path (str): Path to a manifest file or a folder of manifest files to check
        manifest_output_dir (str): Folder to create manifests for missing ids in. When
            not given the missing ids are only logged.
        authority (str): Schema authority to use in generated manifests
        acl_owners (str): Acl owners for generated manifests. Defaults to the config value.
        acl_viewers (str): Acl viewers for generated manifests. Defaults to the config value.
        legal_tags (str): Legal tag for generated manifests. Defaults to the config value.

    Returns:
        None: Results are reported through the logger only.
    """
    if legal_tags is None:
        legal_tags = state.config.get("core", CONFIG_LEGAL_TAG)
    if acl_owners is None:
        acl_owners = state.config.get("core", CONFIG_ACL_OWNER)
    if acl_viewers is None:
        acl_viewers = state.config.get("core", CONFIG_ACL_VIEWER)

    files = get_files_from_path(path)
    logger.debug("Files list: %s", files)

    missing_ids = set()
    for filepath in files:
        if not filepath.endswith(".json"):
            continue

        # Explicit encoding so reading does not depend on the platform default.
        with open(filepath, encoding="utf-8") as file:
            data_object = json.load(file)

        logger.info("Processing file %s.", filepath)

        if not data_object:
            logger.error("Error with file %s. File is empty.", filepath)
            # Skip the file; checking an empty manifest would raise EmptyManifestError.
            continue

        _, _, tmp_missing_ids = get_missing_ids_from_manifest(state.config, data_object)
        logger.info("%i missing id's %s", len(tmp_missing_ids), tmp_missing_ids)
        missing_ids.update(tmp_missing_ids)

    if not missing_ids:
        logger.info("No missing id's found")
        return

    # display missing ids
    logger.info("Total %i missing id's found", len(missing_ids))
    # Manifest builder per id group; other groups cannot be generated.
    creators = (
        ("reference-data", create_reference_data_manifest),
        ("master-data", create_master_data_manifest),
    )
    created = 0
    # Sorted so the output order is stable between runs.
    for _id in sorted(missing_ids):
        if not manifest_output_dir:
            logger.info("Missing %s", _id)
            continue

        # Expected form is <partition>:<kind>:<code>:<version>. Other shapes, such as
        # surrogate keys, are reported instead of aborting the whole run.
        parts = _id.split(":")
        if len(parts) != 4:
            logger.warning("Unknown type %s. Not created!", _id)
            continue
        data_partition, kind, code, _version = parts

        create_manifest = next(
            (creator for prefix, creator in creators if kind.startswith(prefix)), None
        )
        if create_manifest is None:
            logger.warning("Unknown type %s. Not created!", _id)
            continue

        logger.info("Creating %s", _id)
        create_manifest(
            data_partition,
            kind,
            code,
            authority,
            acl_owners,
            acl_viewers,
            legal_tags,
            manifest_output_dir,
        )
        created += 1

    if created:
        logger.info(
            "Manifest files created. You should manually edit these"
            " before uploading including any additional fields."
        )
        logger.info(
            "Check also that missing references aren't already included as part of the"
            " scanned data (this check is currently not implemented)!"
        )
