# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2021 deepset GmbH. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple, Dict, Optional

from pipelines.nodes.base import BaseComponent
from pipelines.schema import Document


class RouteDocuments(BaseComponent):
    """
    A node to split a list of `Document`s by `content_type` or by the values of a metadata field and route them to
    different nodes.

    `Document`s that match none of the configured outputs are not routed anywhere; they are left out of the result.
    """

    # By default (split_by == "content_type"), the node has two outgoing edges.
    outgoing_edges = 2

    def __init__(self, split_by: str = "content_type", metadata_values: Optional[List[str]] = None):
        """
        :param split_by: Field to split the documents by, either `"content_type"` or a metadata field name.
            If this parameter is set to `"content_type"`, the list of `Document`s will be split into a list containing
            only `Document`s of type `"text"` (will be routed to `"output_1"`) and a list containing only `Document`s of
            type `"table"` (will be routed to `"output_2"`).
            If this parameter is set to a metadata field name, you need to specify the parameter `metadata_values` as
            well.
        :param metadata_values: If the parameter `split_by` is set to a metadata field name, you need to provide a list
            of values to group the `Document`s to. `Document`s whose metadata field is equal to the first value of the
            provided list will be routed to `"output_1"`, `Document`s whose metadata field is equal to the second
            value of the provided list will be routed to `"output_2"`, etc.
            This parameter is ignored when `split_by` is `"content_type"`.
        :raises AssertionError: If `split_by` names a metadata field but `metadata_values` is not provided.
        """

        # Raised explicitly instead of using an `assert` statement: `assert` is stripped when Python runs with -O,
        # which would let a misconfigured node through and only fail later in `run()`. The exception type and
        # message are kept identical to the former `assert` for backward compatibility.
        if split_by != "content_type" and metadata_values is None:
            raise AssertionError(
                "If split_by is set to the name of a metadata field, you must provide metadata_values "
                "to group the documents to."
            )

        # Save init parameters to enable export of component config as YAML
        self.set_config(split_by=split_by, metadata_values=metadata_values)

        self.split_by = split_by
        self.metadata_values = metadata_values

        # If we split list of Documents by a metadata field, number of outgoing edges might change:
        # one edge per entry of metadata_values. This instance attribute shadows the class-level default.
        if split_by != "content_type" and metadata_values is not None:
            self.outgoing_edges = len(metadata_values)

    def run(self, documents: List[Document]) -> Tuple[Dict, str]:  # type: ignore
        """
        Group `documents` into one list per outgoing edge.

        :param documents: The `Document`s to split.
        :return: A tuple of a dictionary mapping `"output_1"`, `"output_2"`, ... to the list of `Document`s
            assigned to that output (in their original relative order), and the string `"split_documents"`.
            Every output key is present, even if its list is empty.
        """
        if self.split_by == "content_type":
            split_documents: Dict[str, List[Document]] = {"output_1": [], "output_2": []}

            for doc in documents:
                if doc.content_type == "text":
                    split_documents["output_1"].append(doc)
                elif doc.content_type == "table":
                    split_documents["output_2"].append(doc)
                # Documents of any other content type are left out of both lists.

        else:
            # __init__ already rejects a missing metadata_values; this check additionally narrows the Optional type.
            assert isinstance(
                self.metadata_values, list
            ), "You need to provide metadata_values if you want to split a list of Documents by a metadata field."
            split_documents = {f"output_{i+1}": [] for i in range(len(self.metadata_values))}
            for doc in documents:
                current_metadata_value = doc.meta.get(self.split_by, None)
                # Disregard current document if it does not contain the provided metadata field
                # (a field that is present but set to None is treated the same way).
                if current_metadata_value is None:
                    continue

                try:
                    # list.index returns the first matching position, so if metadata_values contains
                    # duplicates, only the output of the first occurrence ever receives documents.
                    index = self.metadata_values.index(current_metadata_value)
                except ValueError:
                    # Disregard current document if current_metadata_value is not in the provided metadata_values
                    continue

                split_documents[f"output_{index+1}"].append(doc)

        return split_documents, "split_documents"
