# Source code for taxonomy4good.sustainabilityTaxonomy

from .errors import IDNotFoundError, EmptyTaxonomyError, FileTypeNotSupportedError
from .sustainabilityItem import SustainabilityItem
import pandas as pd
import numpy as np
import ast
import json
import os

# Display name of every builtin taxonomy, keyed by the short name that
# ``from_file`` accepts in place of a file path.
TAXONOMIES_DESC = {
    "eu_taxonomy": "EU Taxonomy",
    "ftse_fsgi": "FTSE for Social Good Index",
    "un_sdg": "UN SDGs",
    "world_bank_taxonomy": "World Bank Taxonomy",
    "china_taxonomy": "China Taxonomy",
    "esg_taxonomy": "ESG Taxonomy",
    "en_master_lexicon": "Full Sustainability Lexicon",
    "un_sdg_taxonomy": "UN SDGs",
}

# Short names of the builtin taxonomies, in the same order as their descriptions.
BUILTIN_TAXONOMIES = list(TAXONOMIES_DESC)


class SustainabilityTaxonomy:
    """This object is used to create different taxonomies based on required
    standards. Be it for internal reporting or external, various combinations of
    sustainability related words can be created by grouping those words under
    different categories (words), allowing for the creation of multiple reporting
    items with the needed granularity.

    Feel free to play around with the provided Sustainability Lexicon to create
    your own Taxonomy and make sure you are not missing any word.

    The taxonomy is a tree of items. Every item is expected to expose the
    attributes ``id``, ``name``, ``parent``, ``children`` (``None`` or a list),
    ``score`` and ``weight`` as well as a ``to_dict()`` method.
    """

    def __init__(self, root=None,
                 version_name='Standard Taxonomy',
                 version_num='0.1.0'):
        """Create a taxonomy around ``root``.

        :param root: root item of the taxonomy; when ``None`` the builtin
                     ESG taxonomy is loaded and its name/version are used
        :type root: SustainabilityItem
        :param version_name: the name of the taxonomy
        :type version_name: str
        :param version_num: the number of the taxonomy version
        :type version_num: str
        """
        if root is None:
            # default: ESG Taxonomy
            default_taxonomy = from_file(filepath="esg_taxonomy",
                                         version_name=TAXONOMIES_DESC["esg_taxonomy"],
                                         version_num=version_num,
                                         filetype='excel',
                                         meta=True)
            root = default_taxonomy.root
            version_name = default_taxonomy.version_name
            version_num = default_taxonomy.version_num

        self.root = root
        self.version_name = version_name
        self.version_num = version_num

    @staticmethod
    def _is_leaf(item):
        """Tell whether ``item`` has no children (``None`` or an empty list)."""
        children = item.children
        return children is None or len(children) == 0

    @staticmethod
    def _object_array(elements):
        """Pack ``elements`` into a 1-D object array.

        Filling the array slot by slot prevents NumPy from merging equally
        sized sequences into an additional dimension.
        """
        elements = list(elements)
        packed = np.empty(len(elements), dtype=object)
        for position, element in enumerate(elements):
            packed[position] = element
        return packed

    def insert_items(self, items):
        """ Insert additional items (terms/lexicons) to this existing taxonomy4good

        Every item is attached to the taxonomy item whose id equals
        ``item.parent.id``; ``item.parent`` is re-pointed to that item.

        :param items: terms to add in the taxonomy4good with their respective information
        :type items: SustainabilityItem | list of SustainabilityItem
        :raises ValueError: if one of the items has no parent
        :raises IDNotFoundError: if a parent id does not exist in the taxonomy
        """
        if items is None:
            return

        # make sure input is treated as a list
        if not isinstance(items, (list, tuple, np.ndarray)):
            items = [items]
        items = list(items)
        if not items:
            return

        for item in items:
            if item.parent is None:
                raise ValueError("Items to insert must have a parent")

        # resolve every distinct parent id once; nothing is modified if one is missing
        parent_ids = list(dict.fromkeys(item.parent.id for item in items))
        parents_by_id = dict(zip(parent_ids, self.search_by_id(parent_ids)))

        for item in items:
            parent = parents_by_id[item.parent.id]
            item.parent = parent

            # if parent has no children, create a list with the respective child
            if parent.children is None:
                parent.children = [item]
            else:
                parent.children.append(item)

    def remove_subtree(self, items=None):
        """Remove the passed items along with their children from the taxonomy4good

        Each removed item is detached from its parent, its ``parent`` is set
        to ``None`` and its descendants are detached the same way. A parent
        left without children gets ``children = None`` so that it is treated
        as a leaf again. Removing the root empties the taxonomy.

        :param items: the items of subtrees/substructures to be removed
        :type items: SustainabilityItem | list of SustainabilityItems
        """
        if items is None:
            return

        if not isinstance(items, (list, tuple, np.ndarray)):
            items = [items]

        # iterate over a snapshot: detaching an item mutates its parent's
        # children list, which may be the very list that was passed in
        for item in list(items):

            # if item is not a leaf node, perform this function on children first
            if not self._is_leaf(item):
                self.remove_subtree(item.children)

            parent = item.parent
            if parent is not None:
                siblings = parent.children
                if siblings:
                    # drop this exact object (identity, not equality)
                    for position, sibling in enumerate(siblings):
                        if sibling is item:
                            del siblings[position]
                            break
                    if not siblings:
                        parent.children = None
                item.parent = None
            elif item is self.root:
                self.root = None

    def remove_by_id(self, ids):
        """Remove from the taxonomy4good items corresponding to the supplied ids

        :param ids: ids corresponding to the items to be removed from the taxonomy4good
        :type ids: int | list of int
        :raises IDNotFoundError: if one of the ids does not exist in the taxonomy
        """

        # get items corresponding to the ids, then remove them
        self.remove_subtree(self.search_by_id(ids))

    def get_items_each_level(self, start_root=None):
        """Get lists of items for each level of the taxonomy4good (grouped by level)

        :param start_root: starting node of subtree (default: root of taxonomy4good)
        :type start_root: SustainabilityItem
        :returns: one object array of items per level, top level first,
                  packed in a 1-D object array (empty for an empty taxonomy)
        :rtype: numpy.array
        """

        # if no root is specified, set the root of the taxonomy4good as starting root
        if start_root is None:
            start_root = self.root

        levels = []
        current_items = [] if start_root is None else [start_root]

        # breadth-first walk: stop once a level has no items left
        while current_items:
            levels.append(self._object_array(current_items))

            next_level_items = []
            for item in current_items:
                if not self._is_leaf(item):
                    next_level_items.extend(item.children)
            current_items = next_level_items

        return self._object_array(levels)

    def get_level_items(self, level):
        """Get items of the specified level

        :param level: desired level of the taxonomy4good we wish to extract items from
                      (0 is the root level)
        :type level: int
        :returns: list of items in the specified level
        :rtype: numpy.array
        """

        return self.get_items_each_level(self.root)[level]

    def get_items(self, start_root=None):
        """Get all the items of the structure

        :param start_root: root item of the desired structure or substructure we wish
                           to get items from (default: root of the entire taxonomy4good)
        :type start_root: SustainabilityItem
        :returns: all the items of the taxonomy4good, level by level
        :rtype: numpy.array (SustainabilityItem)
        """

        # if no root is specified, set the root of the taxonomy4good as starting root
        if start_root is None:
            if self.root is None:
                return np.array([])
            start_root = self.root

        levels = self.get_items_each_level(start_root)
        return self._object_array(item for level in levels for item in level)

    def get_terms(self, start_root=None):
        """Get all terms (names/lexicon) in the taxonomy4good

        :param start_root: root item of the desired structure or substructure we wish
                           to get terms from (default: root of the entire taxonomy4good)
        :type start_root: SustainabilityItem
        :returns: all the terms of the taxonomy4good
        :rtype: list of str
        """

        # extract all items first, then return the name attributes
        return [item.name for item in self.get_items(start_root)]

    def get_all_ids(self, start_root=None):
        """Get ids of all the nodes in the current taxonomy4good (grouped by level)

        :param start_root: root item of the desired structure or substructure we wish
                           to get ids from (default: root of the entire taxonomy4good)
        :type start_root: SustainabilityItem
        :returns: one list of ids per level, packed in a 1-D object array
        :rtype: numpy.array (list of int)
        """

        # if no root is specified, set the root of the taxonomy4good as starting root
        if start_root is None:
            start_root = self.root

        levels = self.get_items_each_level(start_root)
        return self._object_array([item.id for item in level] for level in levels)

    def search_by_id(self, ids):
        """Search for items by their id

        :param ids: id or list of ids of the nodes to look for
        :type ids: int | list of int
        :returns: items having the supplied ids, in the order of ``ids``
        :rtype: list of SustainabilityItem objects
        :raises IDNotFoundError: if one of the ids does not exist in the taxonomy
        """
        if isinstance(ids, (int, np.integer)):
            ids = [ids]

        # index the items of the taxonomy by id (first occurrence wins)
        items_by_id = {}
        for item in self.get_items():
            items_by_id.setdefault(item.id, item)

        # check if all ids exist in the taxonomy4good
        missing = set(ids).difference(items_by_id)
        if missing:
            raise IDNotFoundError(f"{missing}"
                                  + " not found in the Taxonomy")

        return [items_by_id[item_id] for item_id in ids]

    def level(self, start_item=None):
        """ Compute the maximum depth/level of the taxonomy4good

        :param start_item: root item of the desired structure or substructure we wish
                           to compute the depth/level
        :type start_item: SustainabilityItem
        :returns: level of the taxonomy4good (0 when the taxonomy is empty,
                  1 for a single item without children)
        :rtype: int
        """

        if self.root is None:
            return 0

        if start_item is None:
            start_item = self.root

        # an item without children forms a single level
        if self._is_leaf(start_item):
            return 1

        # deepest child subtree, plus 1 for the current item
        return 1 + max(self.level(child) for child in start_item.children)

    def to_csv(self, filepath, start_root=None):
        """Save current taxonomy4good/substructure to a csv file

        :param filepath: path where to save the resulting file; the ``.csv``
                         extension is appended to it
        :type filepath: str
        :param start_root: root item of the structure or substructure to be saved as
                          csv (default: root of the entire taxonomy4good)
        :type start_root: SustainabilityItem
        """
        if start_root is None:
            start_root = self.root

        self.to_dataframe(start_root).to_csv(f"{filepath}.csv")

    def to_excel(self, filepath, start_root=None):
        """Save current taxonomy4good/substructure to an Excel file

        :param filepath: path where to save the resulting file; the ``.xlsx``
                         extension is appended to it
        :type filepath: str
        :param start_root: root item of the structure or substructure to be saved as
                          Excel (default: root of the entire taxonomy4good)
        :type start_root: SustainabilityItem
        """

        if start_root is None:
            start_root = self.root

        self.to_dataframe(start_root).to_excel(f"{filepath}.xlsx")

    def items_to_json(self, filepath, start_root=None):
        """Save current taxonomy4good/substructure items to a JSON file (records structure)

        :param filepath: path where to save the resulting file; the ``.json``
                         extension is appended to it
        :type filepath: str
        :param start_root: root item of the structure or substructure to be saved as
                          JSON (default: root of the entire taxonomy4good)
        :type start_root: SustainabilityItem
        """

        # If no substructure root is specified, take the root of the overall structure
        if start_root is None:
            start_root = self.root

        self.to_dataframe(start_root).to_json(f"{filepath}.json", orient='records')

    def taxonomy_to_json(self, filepath, start_root=None):
        """Save current taxonomy4good/substructure items to a JSON file (hierarchical structure)

        :param filepath: path where to save the resulting file; the ``.json``
                         extension is appended to it
        :type filepath: str
        :param start_root: root item of the structure or substructure to be saved as
                          JSON (default: root of the entire taxonomy4good)
        :type start_root: SustainabilityItem
        """

        if start_root is None:
            start_root = self.root

        # convert the current taxonomy4good to a dictionary
        taxonomy_dict = self.taxonomy_to_dict(start_root)

        # save resulting dictionary to a json file
        with open(f"{filepath}.json", "w") as f:
            json.dump(taxonomy_dict, f, indent=4)

    def print_hierarchy(self, start_item=None, current_level=0, islast=False):
        """Print the current hierarchy of the taxonomy4good with the respective values

        The scores of the printed structure are recomputed once before printing.

        :param start_item: starting root of the taxonomy4good/substructure we wish
                            to start from
        :type start_item: SustainabilityItem
        :param current_level: indicating the current level that is being printed
        :type current_level: int
        :param islast: indicating of item is last in the list of children
        :type islast: bool
        :raises EmptyTaxonomyError: if the taxonomy has no root
        """

        # if taxonomy4good is empty, raise error
        if self.root is None:
            raise EmptyTaxonomyError("Taxonomy is empty")

        # if not substructure root is specified, use the entire taxonomy4good
        if start_item is None:
            start_item = self.root

        # one bottom-up pass refreshes the score of every item about to be printed
        self.compute_scores(start_item, False)
        self._print_node(start_item, current_level, islast)

    def _print_node(self, item, current_level, islast):
        """Print ``item`` and, recursively, its descendants (scores already computed)."""
        if current_level == 0:
            # print root
            print(f"{item.name} : {item.score}")
            print("│\n│")
        elif current_level == 1:
            sep = "└" if islast else "├"
            print(sep + "─────" + str(item.name) + " : " + str(item.score))
        else:
            s = " " if islast else "│"
            print(s + (current_level - 1) * "       " + "└───── " + str(item.name)
                  + " : " + str(item.score))

        if item.children is None:
            return

        # update level status
        child_level = current_level + 1
        last_idx = len(item.children) - 1
        for idx, child in enumerate(item.children):
            # only the last first-level item switches the flag; deeper
            # levels inherit it from their ancestor
            if child_level == 1 and idx == last_idx:
                islast = True
            self._print_node(child, child_level, islast)

    def get_level_scores(self, level):
        """Compute the weighted values/scores for the specified level

        :param level: taxonomy4good level
        :type level: int
        :returns: names of level items and their respective ``score`` attribute
        :rtype: dict
        """

        # compute scores for the entire taxonomy4good (bottom up)
        self.compute_scores(self.root, False)

        # create the desired data structure (item name : value)
        return {item.name: item.score for item in self.get_level_items(level)}

    def compute_scores(self, start_root=None, root_score=True):
        """Compute the weighted scores for the entire taxonomy4good

        A leaf contributes ``score * weight``. The ``score`` of an item with
        children is overwritten with the sum of its children's contributions;
        the weight of such an item is not applied.

        :param root_score: decide whether to return the score of the root, default is true
        :type root_score: bool
        :param start_root: root of taxonomy4good/substructure for which we want to compute
                            the score (default: root of the entire taxonomy4good)
        :type start_root: SustainabilityItem
        :returns: the weighted value/score of the root node (start_root); ``None``
                  when ``root_score`` is false and ``start_root`` has children
        :rtype: float
        :raises EmptyTaxonomyError: if no start root is given and the taxonomy is empty
        """
        if start_root is None:
            if self.root is None:
                raise EmptyTaxonomyError("Taxonomy is empty")

            # otherwise set start root as the root of the overall taxonomy4good
            start_root = self.root

        # return weighted score if current item is leaf node
        if self._is_leaf(start_root):
            return start_root.score * start_root.weight

        # sum the contributions of all the children of current item
        total = sum(self.compute_scores(child) for child in start_root.children)

        start_root.score = total
        return total if root_score else None

    def summary(self):
        """Print the general information about the entire taxonomy4good"""

        if self.root is None:
            print("The taxonomy4good is empty")
            return

        print(f"Number of Sustainability items: {self.get_items().size}")
        root_score = self.compute_scores(self.root, True)
        print(f"Overall weighted score: {root_score}")
        print(f"Number of levels : {self.level()}")

        if self.root.children is not None:
            top_level_name = [child.name for child in self.root.children]
            print(f"Top level items are {top_level_name}")
            print(f"Top level items scores: {[item.score for item in self.root.children]}")

    def to_dataframe(self, start_root=None):
        """Convert the entire taxonomy4good to a DataFrame

        :param start_root: the root item of the taxonomy4good/substructure to be converted
                          to a DataFrame (default: root of the overall taxonomy4good)
        :type start_root: SustainabilityItem
        :returns: a dataframe version of the taxonomy4good (one row per item)
        :rtype: pd.DataFrame"""

        if start_root is None:
            start_root = self.root

        # convert taxonomy4good to a list of records first
        return pd.DataFrame(self.items_to_dict(start_root))

    def similar_items(self, sustainability_items):
        """Gives the items under the same parent

        :param sustainability_items: list of items which items under the same parent
                                     are returned
        :type sustainability_items: SustainabilityItem | list of SustainabilityItem
        :returns: for a single item, the children of its parent (an empty list
                  when it has no parent); for a list, one children list per
                  distinct parent, in order of first appearance
        :rtype: list of SustainabilityItem | list of SustainabilityItem lists"""

        # if input is a single item, return directly the children of its parent
        if not isinstance(sustainability_items, list):
            parent = sustainability_items.parent
            return [] if parent is None else parent.children

        # collect each distinct parent once, skipping items without parent (roots)
        parents = {}
        for item in sustainability_items:
            if item.parent is not None:
                parents.setdefault(id(item.parent), item.parent)

        # get the children from the resulting parents
        return [parent.children for parent in parents.values()]

    def similar_items_byid(self, ids):
        """Gives the items under the same parent as items having the specified ids

        :param ids: list of ids which items under the same parent of the items having
                    the specified ids are returned
        :type ids: int | list of int
        :returns: list of child items under the parents of the specified items
        :rtype: list of SustainabilityItem lists
        :raises IDNotFoundError: if one of the ids does not exist in the taxonomy"""

        sustainability_items = self.search_by_id(ids)
        if len(sustainability_items) == 1:
            sustainability_items = sustainability_items[0]
        return self.similar_items(sustainability_items)

    def _match_by_name(self, terms, start_root):
        """Return, for every term, the items whose name contains it (case-insensitive)."""
        if start_root is None:
            start_root = self.root

        if not isinstance(terms, list):
            terms = [terms]

        # get all items start from start_root
        items = self.get_items(start_root)

        matches = []
        for term in terms:
            needle = term.lower()
            matches.append([item for item in items if needle in item.name.lower()])
        return matches

    def search_items_by_name(self, terms, start_root=None):
        """Look for similar SustainabilityItems using a string partial match

        :param terms: term or list of terms/names to search for
        :type terms: str | list of str
        :param start_root: the root item of the taxonomy4good/substructured to be searched
                          from (default: root of the overall taxonomy4good)
        :type start_root: SustainabilityItem
        :returns: items whose name contains the term (case-insensitive); a flat
                  list for a single term, otherwise one list per term
        :rtype: list of SustainabilityItems | list of SustainabilityItem lists
        """
        items_found = self._match_by_name(terms, start_root)

        # a single term yields a flat list instead of a list of lists
        return items_found[0] if len(items_found) == 1 else items_found

    def search_similar_names(self, terms, start_root=None):
        """Search for similar names/terms in the taxonomy4good using a string partial match

        :param terms: term or list of terms/names to search for
        :type terms: str | list of str
        :param start_root: the root item of the taxonomy4good/substructured to be searched
                          from (default: root of the overall taxonomy4good)
        :type start_root: SustainabilityItem
        :returns: names containing the term (case-insensitive); a flat list for
                  a single term, otherwise one list per term
        :rtype: list of str | list of str lists
        """
        names_found = [[item.name for item in matches]
                       for matches in self._match_by_name(terms, start_root)]

        # a single term yields a flat list instead of a list of lists
        return names_found[0] if len(names_found) == 1 else names_found

    def items_to_dict(self, start_root=None):
        """Convert the entire taxonomy4good to a dictionary (records) starting from start_root

        :param start_root: the root item of the taxonomy4good/substructured of which items
                          are to be converted to dictionary (default: root of the
                          overall taxonomy4good)
        :type start_root: SustainabilityItem
        :returns: list of dictionary converted items (records)
        :rtype: list of dict
        """
        if start_root is None:
            start_root = self.root

        # convert each item of the structure to a dictionary
        return [item.to_dict() for item in self.get_items(start_root)]

    def taxonomy_to_dict(self, start_root=None):
        """Convert the entire taxonomy4good to a dictionary (structural hierarchy)
        starting from start_root

        Every item is represented by the result of its ``to_dict()``; for an
        item with children, the ``'children'`` key is set to the list of the
        converted children.

        :param start_root: the root item of the taxonomy4good/substructured to be converted
                          to dictionary (default: root of the overall taxonomy4good)
        :type start_root: SustainabilityItem
        :returns: dictionary version of the taxonomy4good
        :rtype: dict
        :raises EmptyTaxonomyError: if no start root is given and the taxonomy is empty
        """
        if start_root is None:
            if self.root is None:
                raise EmptyTaxonomyError("Taxonomy is empty")
            start_root = self.root

        root_dict = start_root.to_dict()
        if not self._is_leaf(start_root):
            root_dict['children'] = [self.taxonomy_to_dict(child)
                                     for child in start_root.children]

        return root_dict


def from_file(filepath, version_name="Standard Taxonomy", version_num="0.1.0", filetype='excel', meta=False):
    """Create a taxonomy from existing file. This can be a builtin taxonomy in taxonomy4good or a newly created one.

    The file is read as a table with the columns ``id``, ``name``, ``level``,
    ``grouping``, ``parent``, ``score``, ``weight`` and ``children``; any other
    column is treated as meta-data. A synthetic root item with id 0 is created
    and every row without parent is attached to it. A row must appear after
    the row of its parent.

    :param filepath: the path of the file describing the structure of taxonomy or the name of builtin taxonomy.
    :type filepath: str
    :param version_name: the name of the taxonomy
    :type version_name: str
    :param version_num: the number of the taxonomy version
    :type version_num: str
    :param filetype: the type of the file (excel or json)
    :type filetype: str
    :param meta: indicating if the file include meta-data
    :type meta: bool

    :returns: create taxonomy from the indicated file
    :rtype: SustainabilityTaxonomy
    :raises FileTypeNotSupportedError: if ``filetype`` is neither ``'excel'`` nor ``'json'``
    :raises IDNotFoundError: if a row refers to a parent id that was not read before it
    """
    root = SustainabilityItem(id=0, name=version_name)

    if filetype == 'excel':
        # if the name corresponds to one of the existing taxonomies, get file from taxonomies directory
        if filepath in BUILTIN_TAXONOMIES:
            version_name = TAXONOMIES_DESC[filepath]
            root.name = version_name
            package_dir = os.path.dirname(os.path.abspath(__file__))
            items_df = pd.read_excel(os.path.join(package_dir, "taxonomies", filepath + ".xlsx"))
        else:
            items_df = pd.read_excel(filepath)
    elif filetype == 'json':
        items_df = pd.read_json(filepath)
    else:
        raise FileTypeNotSupportedError(f"{filetype} is currently not supported")

    # empty cells become None
    items_df = items_df.replace({np.nan: None})

    # Consider any additional columns as meta-data
    item_columns = {"id", "name", "level", "grouping",
                    "parent", "score", "weight", "children"}
    meta_data_col = [col for col in items_df.columns if col not in item_columns]

    # items created so far, looked up by id when a later row names them as parent
    items_by_id = {0: root}

    # create sustainability items
    for record in items_df.to_dict('records'):
        # create item with respective attributes
        meta_dict = {key: record[key] for key in meta_data_col} if meta else {}

        sustainability_item = SustainabilityItem(id=record['id'],
                                                 name=record['name'],
                                                 level=record['level'],
                                                 grouping=record['grouping'],
                                                 parent=record['parent'],
                                                 score=record['score'],
                                                 weight=record['weight'],
                                                 children=record['children'],
                                                 meta_data=meta_dict)

        if record['parent'] is not None:
            # replace the parent id with the parent item itself
            parent_id = int(record['parent'])
            if parent_id not in items_by_id:
                raise IDNotFoundError(f"{ {parent_id} }" + " not found in the Taxonomy")
            parent = items_by_id[parent_id]
            sustainability_item.parent = parent

            # the children of the parent may still be missing or in their file
            # representation (a string such as "[3, 4]"): turn them into a list
            if parent.children is None:
                parent.children = []
            elif not isinstance(parent.children, list):
                parsed_children = ast.literal_eval(parent.children)
                if isinstance(parsed_children, (list, tuple)):
                    parent.children = list(parsed_children)
                else:
                    parent.children = [parsed_children]

            # swap the id placeholder of this item for the item itself; an item
            # that the parent does not list is appended instead
            for position, placeholder in enumerate(parent.children):
                if isinstance(placeholder, SustainabilityItem):
                    continue
                if sustainability_item.id == placeholder:
                    parent.children[position] = sustainability_item
                    break
            else:
                parent.children.append(sustainability_item)

        else:
            # items without parent are attached to the synthetic root
            if root.children is None:
                root.children = []
            root.children.append(sustainability_item)
            sustainability_item.parent = root

        items_by_id.setdefault(sustainability_item.id, sustainability_item)

    return SustainabilityTaxonomy(root, version_name, version_num)