Arquivos
cuneiform-sign-detection-code/lib/transliteration/TransliterationSet.py
T
2020-11-19 12:18:53 +01:00

52 linhas
2.0 KiB
Python

import pandas as pd
from ..utils.path_utils import *
class TransliterationSet:
def __init__(self, collections=[], relative_path='../../'):
# load list of coll_tl_df
list_coll_tl_df = []
for collection in collections:
coll_tl_file = '{}data/transliterations/transliterations_{}.csv'.format(relative_path, collection)
# check if transliteration exists
if os.path.isfile(coll_tl_file):
print('Transliteration file {} found!'.format(coll_tl_file))
# load transliteration
coll_tl_df = pd.read_csv(coll_tl_file)
# select subset of columns
coll_tl_df = coll_tl_df[['segm_idx', 'tablet_CDLI', 'train_label', 'mzl_label', 'line_idx', 'pos_idx', 'status']]
coll_tl_df['lbl'] = coll_tl_df['train_label']
coll_tl_df['mzl_lbl'] = coll_tl_df['mzl_label']
else:
print('Transliteration file {} NOT found!'.format(coll_tl_file))
coll_tl_df = pd.DataFrame()
# append coll_tl_df to list
list_coll_tl_df.append(coll_tl_df)
# make accessible
self.collections = collections
self.list_coll_tl_df = list_coll_tl_df
def get_tl_df(self, seg_rec, verbose=True):
# init empty tl
num_lines = 0
tl_df = pd.DataFrame()
# select corresponding coll_tl_df
collection = seg_rec.collection
coll_idx = self.collections.index(collection)
coll_tl_df = self.list_coll_tl_df[coll_idx]
# check if transliterations available
if len(coll_tl_df) > 0:
# select corresponding tl_df slice in coll_df
tl_df = coll_tl_df[coll_tl_df.segm_idx == seg_rec.name]
# compute number lines
num_lines = tl_df.line_idx.nunique()
# report if transliteration is missing
if len(tl_df) == 0:
if verbose:
print('No transliteration found for {}!'.format(seg_rec.tablet_CDLI))
return tl_df, num_lines