from .errors import IDNotFoundError, EmptyTaxonomyError, FileTypeNotSupportedError
from .sustainabilityItem import SustainabilityItem
import pandas as pd
import numpy as np
import ast
import json
import os
# Display name for every built-in taxonomy, keyed by the identifier that
# callers pass as ``filepath`` to select it.
TAXONOMIES_DESC = dict(
    eu_taxonomy="EU Taxonomy",
    ftse_fsgi="FTSE for Social Good Index",
    un_sdg="UN SDGs",
    world_bank_taxonomy="World Bank Taxonomy",
    china_taxonomy="China Taxonomy",
    esg_taxonomy="ESG Taxonomy",
    en_master_lexicon="Full Sustainability Lexicon",
    un_sdg_taxonomy="UN SDGs",
)
# Identifiers recognised as built-in taxonomies (same order as above).
BUILTIN_TAXONOMIES = list(TAXONOMIES_DESC)
[docs]class SustainabilityTaxonomy:
"""This object is used to create different taxonomies based on required
standards. Be it for internal reporting or external, various combinations of
sustainability related words can be created by grouping those words under
different categories (words), allowing for the creation of multiple reporting
items with the needed granularity.
Feel free to play around with the provided Sustainability Lexicon to create
your own Taxonomy and make sure you are not missing any word.
"""
def __init__(self, root=None,
             version_name='Standard Taxonomy',
             version_num='0.1.0'):
    """Create a taxonomy around ``root`` or load the default ESG taxonomy.

    :param root: root item of the taxonomy; when ``None`` the built-in
                 "esg_taxonomy" is loaded through ``from_file``
    :type root: SustainabilityItem
    :param version_name: name of the taxonomy (only used when ``root`` is
                         supplied; the default taxonomy carries its own name)
    :type version_name: str
    :param version_num: version number of the taxonomy
    :type version_num: str
    """
    if root is not None:
        self.root = root
        self.version_name = version_name
        self.version_num = version_num
        return

    # No root supplied: fall back to the built-in ESG taxonomy.
    default_key = "esg_taxonomy"
    loaded = from_file(filepath=default_key,
                       version_name=TAXONOMIES_DESC[default_key],
                       version_num=version_num,
                       filetype='excel',
                       meta=True)
    self.root = loaded.root
    self.version_name = loaded.version_name
    self.version_num = loaded.version_num
def insert_items(self, items):
    """Insert additional items (terms/lexicons) into this taxonomy.

    Each item is appended to the children of the taxonomy node whose id
    equals ``item.parent.id``, and the item's ``parent`` attribute is
    re-pointed at that node. Parents are resolved by id, so the parent
    reference carried by an item does not have to be the very object stored
    in the taxonomy.

    :param items: item(s) to add; every item must carry a ``parent``
    :type items: SustainabilityItem | list of SustainabilityItem
    :raises IDNotFoundError: raised by ``search_by_id`` when a parent id is
                             not part of the taxonomy
    """
    if items is None:
        return
    # make sure input is treated as a list
    if not isinstance(items, list):
        items = [items]

    # unique parent ids, in first-seen order
    parent_ids = list(dict.fromkeys(item.parent.id for item in items))
    parents_by_id = {parent.id: parent
                     for parent in self.search_by_id(parent_ids)}

    for item in items:
        parent = parents_by_id[item.parent.id]
        item.parent = parent
        # a parent without children stores None rather than an empty list
        if parent.children is None:
            parent.children = [item]
        else:
            parent.children.append(item)
# TODO: fix bug in remove function
def remove_subtree(self, items=None):
    """Remove the passed items along with their children from the taxonomy.

    Descendants are detached first, then each item is removed from its
    parent's ``children``. A parent left without children gets
    ``children = None``, the representation the rest of the class uses for
    a leaf.

    :param items: the items of subtrees/substructures to be removed;
                  ``None`` is a no-op
    :type items: SustainabilityItem | list of SustainabilityItem | numpy.ndarray
    """
    if items is None:
        return
    if not isinstance(items, (list, np.ndarray)):
        items = [items]

    # Iterate over a snapshot: during recursion ``items`` is the same list
    # object as ``parent.children``, and removing from a list while looping
    # over it silently skips every other element.
    for item in list(items):
        # if item is not a leaf node, remove its children first
        if item.children is not None:
            self.remove_subtree(item.children)
        parent = item.parent
        if parent is not None:
            parent.children.remove(item)
            if not parent.children:
                parent.children = None
def remove_by_id(self, ids):
    """Remove the items matching ``ids`` (with their subtrees) from the taxonomy.

    :param ids: ids of the items to be removed
    :type ids: int | list of int
    """
    # resolve the ids to items, then delegate the actual removal
    self.remove_subtree(self.search_by_id(ids))
def get_items_each_level(self, start_root=None):
    """Get the items of the taxonomy grouped by level (breadth-first).

    The traversal simply continues until a level has no children, so it
    does not depend on a separately computed depth.

    :param start_root: starting node of subtree (default: root of taxonomy)
    :type start_root: SustainabilityItem
    :returns: 1-D object array with one entry per level; each entry is a
              1-D object array of the items on that level. Empty when the
              taxonomy has no root.
    :rtype: numpy.array
    """
    # if no root is specified, start from the root of the taxonomy
    if start_root is None:
        start_root = self.root

    levels = []
    frontier = [] if start_root is None else [start_root]
    while frontier:
        # fill element-wise so numpy never tries to interpret the items
        level_items = np.empty(len(frontier), dtype=object)
        for position, node in enumerate(frontier):
            level_items[position] = node
        levels.append(level_items)
        frontier = [child
                    for node in frontier if node.children is not None
                    for child in node.children]

    # Assign level by level: np.array(levels, dtype=object) would build a
    # 2-D array whenever every level happens to have the same width.
    grouped = np.empty(len(levels), dtype=object)
    for depth, level_items in enumerate(levels):
        grouped[depth] = level_items
    return grouped
def get_level_items(self, level):
    """Return the items located on one level of the taxonomy.

    :param level: index of the level to extract (the root is level 0)
    :type level: int
    :returns: items found on the requested level
    :rtype: numpy.array
    """
    items_by_level = self.get_items_each_level(self.root)
    return items_by_level[level]
def get_items(self, start_root=None):
    """Return every item of the structure as one flat array.

    :param start_root: root item of the structure or substructure to collect
                       items from (default: root of the entire taxonomy)
    :type start_root: SustainabilityItem
    :returns: all items, level after level; an empty array when no root
              is available
    :rtype: numpy.array (SustainabilityItem)
    """
    origin = self.root if start_root is None else start_root
    # origin can only be None here when the taxonomy itself is empty
    if origin is None:
        return np.array([])
    per_level = self.get_items_each_level(origin)
    return np.concatenate(per_level)
def get_terms(self, start_root=None):
    """Return the names (terms) of all items in the taxonomy.

    :param start_root: root item of the structure or substructure to collect
                       terms from (default: root of the entire taxonomy)
    :type start_root: SustainabilityItem
    :returns: the ``name`` attribute of every item
    :rtype: list of str
    """
    terms = []
    for item in self.get_items(start_root):
        terms.append(item.name)
    return terms
def get_all_ids(self, start_root=None):
    """Get the ids of all nodes in the taxonomy, grouped by level.

    :param start_root: root item of the structure or substructure to collect
                       ids from (default: root of the entire taxonomy)
    :type start_root: SustainabilityItem
    :returns: 1-D object array with one list of ids per level
    :rtype: numpy.array
    """
    # if no root is specified, start from the root of the taxonomy
    if start_root is None:
        start_root = self.root
    levels = self.get_items_each_level(start_root)

    # Fill an object array slot by slot: np.array(list_of_lists, dtype=object)
    # collapses equally sized levels into a 2-D array of bare ids.
    ids = np.empty(len(levels), dtype=object)
    for depth, level_items in enumerate(levels):
        ids[depth] = [item.id for item in level_items]
    return ids
def search_by_id(self, ids):
    """Search for items by their id.

    :param ids: id or list of ids of the nodes to look for
    :type ids: int | list of int
    :returns: items having the supplied ids, in the order of ``ids``
    :rtype: list of SustainabilityItem objects
    :raises IDNotFoundError: when at least one id is not in the taxonomy
    """
    # accept a bare id, including numpy integer scalars
    if isinstance(ids, (int, np.integer)):
        ids = [ids]

    # index every item of the taxonomy by its id
    items_by_id = {}
    for item in self.get_items():
        items_by_id.setdefault(item.id, []).append(item)

    # check if all ids exist in the taxonomy
    missing = set(ids).difference(items_by_id)
    if missing:
        raise IDNotFoundError(f"{missing}"
                              + " not found in the Taxonomy")

    return [item for wanted in ids for item in items_by_id[wanted]]
def level(self, start_item=None):
    """Compute the maximum depth/level of the taxonomy.

    :param start_item: root item of the structure or substructure whose
                       depth is computed (default: root of the taxonomy)
    :type start_item: SustainabilityItem
    :returns: number of levels below and including ``start_item``;
              0 when the taxonomy has no root
    :rtype: int
    """
    if self.root is None:
        return 0
    if start_item is None:
        start_item = self.root
    children = start_item.children
    # a node without children (None or an empty list) is a single level
    if children is None or len(children) == 0:
        return 1
    # deepest child subtree, plus one for the current item
    return 1 + max(self.level(child) for child in children)
def to_csv(self, filepath, start_root=None):
    """Write the taxonomy (or a substructure) to ``<filepath>.csv``.

    :param filepath: target path without extension; ``.csv`` is appended
    :type filepath: str
    :param start_root: root item of the structure or substructure to save
                       (default: root of the entire taxonomy)
    :type start_root: SustainabilityItem
    """
    origin = self.root if start_root is None else start_root
    self.to_dataframe(origin).to_csv(f"{filepath}.csv")
def to_excel(self, filepath, start_root=None):
    """Write the taxonomy (or a substructure) to ``<filepath>.xlsx``.

    :param filepath: target path without extension; ``.xlsx`` is appended
    :type filepath: str
    :param start_root: root item of the structure or substructure to save
                       (default: root of the entire taxonomy)
    :type start_root: SustainabilityItem
    """
    origin = self.root if start_root is None else start_root
    self.to_dataframe(origin).to_excel(f"{filepath}.xlsx")
def items_to_json(self, filepath, start_root=None):
    """Write the items of the taxonomy to ``<filepath>.json`` as records.

    :param filepath: target path without extension; ``.json`` is appended
    :type filepath: str
    :param start_root: root item of the structure or substructure to save
                       (default: root of the entire taxonomy)
    :type start_root: SustainabilityItem
    """
    # fall back to the root of the overall structure
    origin = self.root if start_root is None else start_root
    self.to_dataframe(origin).to_json(f"{filepath}.json", orient='records')
def taxonomy_to_json(self, filepath, start_root=None):
    """Write the taxonomy to ``<filepath>.json`` as a nested hierarchy.

    :param filepath: target path without extension; ``.json`` is appended
    :type filepath: str
    :param start_root: root item of the structure or substructure to save
                       (default: root of the entire taxonomy)
    :type start_root: SustainabilityItem
    """
    origin = self.root if start_root is None else start_root
    # build the hierarchical dictionary first, then serialise it
    hierarchy = self.taxonomy_to_dict(origin)
    target = f"{filepath}.json"
    with open(target, "w") as handle:
        json.dump(hierarchy, handle, indent=4)
def print_hierarchy(self, start_item=None, current_level=0, islast=False):
    """Print the hierarchy of the taxonomy together with the item scores.

    Scores are refreshed through ``compute_scores`` before an item is
    printed; the method then recurses into the children.

    :param start_item: root of the taxonomy/substructure to print
                       (default: root of the entire taxonomy)
    :type start_item: SustainabilityItem
    :param current_level: level currently being printed (0 for the root)
    :type current_level: int
    :param islast: whether the branch belongs to the last child of the root
    :type islast: bool
    :raises EmptyTaxonomyError: when the taxonomy has no root
    """
    if self.root is None:
        raise EmptyTaxonomyError("Taxonomy is empty")

    node = self.root if start_item is None else start_item
    self.compute_scores(node, False)

    if current_level == 0:
        # root line followed by the trunk of the tree
        print(f"{node.name} : {node.score}")
        print("│\n│")
    elif current_level == 1:
        branch = "└" if islast else "├"
        print(f"{branch}─────{node.name!s} : {node.score!s}")
    else:
        # deeper levels keep the trunk unless the last branch is printed
        gutter = " " if islast else "│"
        padding = (current_level - 1) * " "
        print(f"{gutter}{padding}└───── {node.name!s} : {node.score!s}")

    child_level = current_level + 1
    children = node.children
    if children is None:
        return
    final_index = len(children) - 1
    for index, child in enumerate(children):
        # only the last direct child of the root flips the flag
        if child_level == 1 and index == final_index:
            islast = True
        self.print_hierarchy(child, child_level, islast)
def get_level_scores(self, level):
    """Return the weighted scores of the items on one level.

    :param level: taxonomy level
    :type level: int
    :returns: mapping of item name to score for the requested level
    :rtype: dict
    """
    # refresh the scores of the whole taxonomy (bottom up) first
    self.compute_scores(self.root, False)
    scores = {}
    for item in self.get_level_items(level):
        scores[item.name] = item.score
    return scores
def compute_scores(self, start_root=None, root_score=True):
    """Compute the scores of the taxonomy bottom-up.

    A leaf contributes ``score * weight``. Every other item has its
    ``score`` attribute overwritten with the sum of its children's
    contributions.

    :param start_root: root of the taxonomy/substructure to score
                       (default: root of the entire taxonomy)
    :type start_root: SustainabilityItem
    :param root_score: whether to return the score of ``start_root``;
                       leaves always return their weighted score
    :type root_score: bool
    :returns: score of ``start_root``, or ``None`` for a non-leaf item when
              ``root_score`` is false
    :rtype: float
    :raises EmptyTaxonomyError: when no start item is given and the taxonomy
                                has no root
    """
    node = start_root
    if node is None:
        if self.root is None:
            raise EmptyTaxonomyError("Taxonomy is empty")
        node = self.root

    # leaf: weighted score, regardless of root_score
    if node.children is None:
        return node.score * node.weight

    total = sum(self.compute_scores(child) for child in node.children)
    node.score = total
    return total if root_score else None
def summary(self):
    """Print general information about the entire taxonomy.

    Reports the number of items, the overall score, the number of levels
    and, when the root has children, their names and scores.
    """
    if self.root is None:
        print("The taxonomy4good is empty")
        return

    print(f"Number of Sustainability items: {self.get_items().size}")
    overall = self.compute_scores(self.root, True)
    print(f"Overall weighted score: {overall}")
    print(f"Number of levels : {self.level()}")

    top_items = self.root.children
    if top_items is None:
        return
    names = [child.name for child in top_items]
    print(f"Top level items are {names}")
    scores = [item.score for item in top_items]
    print(f"Top level items scores: {scores}")
def to_dataframe(self, start_root=None):
    """Convert the taxonomy (or a substructure) to a DataFrame.

    :param start_root: root item of the taxonomy/substructure to convert
                       (default: root of the overall taxonomy)
    :type start_root: SustainabilityItem
    :returns: one row per item, built from the items' dictionary form
    :rtype: pd.DataFrame
    """
    origin = self.root if start_root is None else start_root
    # the record dictionaries are produced by items_to_dict
    records = self.items_to_dict(origin)
    return pd.DataFrame(records)
def similar_items(self, sustainability_items):
    """Give the items located under the same parent as the given items.

    :param sustainability_items: item or list of items whose siblings are
                                 requested
    :type sustainability_items: SustainabilityItem | list of SustainabilityItem
    :returns: for a single item, the children of its parent (an empty list
              when the item has no parent); for a list, one children list
              per distinct parent, in the order the parents first appear
    :rtype: list of SustainabilityItem | list of SustainabilityItem lists
    """
    # single item: return the children of its parent directly
    if not isinstance(sustainability_items, list):
        parent = sustainability_items.parent
        return [] if parent is None else parent.children

    # Skip root items and de-duplicate the parents while keeping their
    # first-seen order (a set would make the result order arbitrary).
    parents = dict.fromkeys(item.parent for item in sustainability_items
                            if item.parent is not None)
    return [parent.children for parent in parents]
def similar_items_byid(self, ids):
    """Give the items under the same parent as the items with the given ids.

    :param ids: ids of the items whose siblings are requested
    :type ids: list of int
    :returns: result of ``similar_items`` for the matching items
    :rtype: list of SustainabilityItem lists
    """
    found = self.search_by_id(ids)
    # a single match is handed over as a bare item, not a one-element list
    target = found[0] if len(found) == 1 else found
    return self.similar_items(target)
def search_items_by_name(self, terms, start_root=None):
    """Find items whose name contains a term (case-insensitive substring).

    :param terms: term or list of terms to search for
    :type terms: str | list of str
    :param start_root: root item of the taxonomy/substructure to search
                       (default: root of the overall taxonomy)
    :type start_root: SustainabilityItem
    :returns: a flat list of matches when exactly one term is searched,
              otherwise one list of matches per term
    :rtype: list of SustainabilityItem | list of SustainabilityItem lists
    """
    origin = self.root if start_root is None else start_root
    wanted = terms if isinstance(terms, list) else [terms]
    candidates = self.get_items(origin)

    matches_per_term = [
        [item for item in candidates if term.lower() in item.name.lower()]
        for term in wanted
    ]
    # a single term yields its matches directly, without the outer list
    if len(matches_per_term) == 1:
        return matches_per_term[0]
    return matches_per_term
def search_similar_names(self, terms, start_root=None):
    """Find item names that contain a term (case-insensitive substring).

    :param terms: term or list of terms to search for
    :type terms: str | list of str
    :param start_root: root item of the taxonomy/substructure to search
                       (default: root of the overall taxonomy)
    :type start_root: SustainabilityItem
    :returns: a flat list of names when exactly one term is searched,
              otherwise one list of names per term
    :rtype: list of str | list of str lists
    """
    origin = self.root if start_root is None else start_root
    wanted = terms if isinstance(terms, list) else [terms]
    candidates = self.get_items(origin)

    names_per_term = [
        [item.name for item in candidates if term.lower() in item.name.lower()]
        for term in wanted
    ]
    # a single term yields its names directly, without the outer list
    if len(names_per_term) == 1:
        return names_per_term[0]
    return names_per_term
def items_to_dict(self, start_root=None):
    """Convert the items of the taxonomy to a list of dictionaries (records).

    :param start_root: root item of the taxonomy/substructure whose items
                       are converted (default: root of the overall taxonomy)
    :type start_root: SustainabilityItem
    :returns: the ``to_dict()`` form of every item
    :rtype: list of dict
    """
    origin = self.root if start_root is None else start_root
    records = []
    for item in self.get_items(origin):
        records.append(item.to_dict())
    return records
def taxonomy_to_dict(self, start_root=None):
    """Convert the taxonomy to a nested structure starting from start_root.

    An item with children becomes its ``to_dict()`` form plus a
    ``'children'`` list. An item without children is returned as a
    one-element list holding its dictionary, so leaf entries inside a
    multi-child ``'children'`` list are themselves wrapped in a list.

    :param start_root: root item of the taxonomy/substructure to convert
                       (default: root of the overall taxonomy)
    :type start_root: SustainabilityItem
    :returns: nested representation of the taxonomy
    :rtype: dict (list of dict for a leaf item)
    """
    node = self.root if start_root is None else start_root
    children = node.children
    if children is None:
        return [node.to_dict()]

    if len(children) == 1:
        # an only child is not wrapped a second time when it is a leaf
        nested = self.taxonomy_to_dict(children[0])
        if isinstance(nested, dict):
            nested = [nested]
    else:
        nested = [self.taxonomy_to_dict(child) for child in children]

    result = node.to_dict()
    result['children'] = nested
    return result
def from_file(filepath, version_name="Standard Taxonomy", version_num="0.1.0", filetype='excel', meta=False):
    """Create a taxonomy from an existing file or from a built-in taxonomy.

    A synthetic root item (id 0) is created and every row of the file
    becomes a SustainabilityItem. Rows without a parent are attached to the
    synthetic root. Rows with a parent replace their id inside the parent's
    ``children`` list, or are appended when the parent does not list them.

    :param filepath: path of the file describing the taxonomy, or the name
                     of a built-in taxonomy (see ``BUILTIN_TAXONOMIES``)
    :type filepath: str
    :param version_name: the name of the taxonomy; replaced by the built-in
                         description when a built-in taxonomy is loaded
    :type version_name: str
    :param version_num: the number of the taxonomy version
    :type version_num: str
    :param filetype: the type of the file ('excel' or 'json')
    :type filetype: str
    :param meta: whether columns other than the standard ones are stored as
                 meta-data on the items
    :type meta: bool
    :returns: taxonomy built from the indicated file
    :rtype: SustainabilityTaxonomy
    :raises FileTypeNotSupportedError: for any other ``filetype``
    """
    root = SustainabilityItem(id=0, name=version_name)
    if filetype == 'excel':
        # a built-in name is resolved inside the package's taxonomies directory
        if filepath in BUILTIN_TAXONOMIES:
            version_name = TAXONOMIES_DESC[filepath]
            root.name = version_name
            builtin_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                       "taxonomies")
            items_df = pd.read_excel(os.path.join(builtin_dir, filepath + ".xlsx"))
        else:
            items_df = pd.read_excel(filepath)
    elif filetype == 'json':
        items_df = pd.read_json(filepath)
    else:
        raise FileTypeNotSupportedError(f"{filetype} is currently not supported")
    items_df.replace({np.nan: None}, inplace=True)

    # Consider any additional columns as meta-data
    standard_columns = ("id", "name", "level", "grouping",
                        "parent", "score", "weight", "children")
    meta_data_col = [col for col in items_df.columns
                     if col not in standard_columns]

    # items[0] is the synthetic root; rows follow in file order
    items = [root]
    for record in items_df.to_dict('records'):
        meta_dict = {key: record[key] for key in meta_data_col} if meta else {}
        sustainability_item = SustainabilityItem(id=record['id'],
                                                 name=record['name'],
                                                 level=record['level'],
                                                 grouping=record['grouping'],
                                                 parent=record['parent'],
                                                 score=record['score'],
                                                 weight=record['weight'],
                                                 children=record['children'],
                                                 meta_data=meta_dict)
        if record['parent'] is not None:
            # NOTE: the parent is looked up by position, which assumes that
            # the ids in the file match the row order (1, 2, 3, ...).
            parent = items[int(record['parent'])]
            if parent.children is None:
                parent.children = []
            elif not isinstance(parent.children, list):
                # convert string list to a list
                parent.children = ast.literal_eval(parent.children)
            # swap the id placeholder for the item itself; append when the
            # parent does not list this id
            for position, listed in enumerate(parent.children):
                if sustainability_item.id == listed:
                    parent.children[position] = sustainability_item
                    break
            else:
                parent.children.append(sustainability_item)
        else:
            # top-level row: hang it under the synthetic root
            parent = root
            if root.children is None:
                root.children = []
            root.children.append(sustainability_item)
        sustainability_item.parent = parent
        items.append(sustainability_item)
    return SustainabilityTaxonomy(root, version_name, version_num)