File manager

File manager - Edit - /usr/local/lib/python3.9/dist-packages/pythainlp/parse/transformers_ud.py

Back
# -*- coding: utf-8 -*-
"""
TransformersUD

Author: Prof. Koichi Yasuoka

This tagger is provided under the terms of the apache-2.0 License.

The source: https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head

GitHub: https://github.com/KoichiYasuoka
"""
import os
from itertools import accumulate, chain
from typing import List, Union

import numpy
import torch
import ufal.chu_liu_edmonds
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoModelForTokenClassification,
    AutoTokenizer,
    TokenClassificationPipeline,
)
from transformers.utils import cached_file

_DEFAULT_MODEL = "KoichiYasuoka/deberta-base-thai-ud-head"


class Parse:
    """Dependency parser backed by a question-answering head model.

    Three models are loaded from one checkpoint: the question-answering
    model at the checkpoint root (used to score head candidates), and two
    token classifiers stored in its ``deprel`` and ``tagger`` sub-folders.
    """

    def __init__(self, model: str = _DEFAULT_MODEL) -> None:
        """Load the tokenizer and the three models.

        :param str model: local directory or Hugging Face model id;
            ``None`` falls back to the default checkpoint.
        """
        if model is None:
            model = _DEFAULT_MODEL
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model)
        self.deprel = TokenClassificationPipeline(
            model=self._load_token_classifier(model, "deprel"),
            tokenizer=self.tokenizer,
            aggregation_strategy="simple",
        )
        self.tagger = TokenClassificationPipeline(
            model=self._load_token_classifier(model, "tagger"),
            tokenizer=self.tokenizer,
        )

    @staticmethod
    def _load_token_classifier(model: str, name: str):
        """Load the token classifier stored in sub-folder *name* of *model*."""
        load = AutoModelForTokenClassification.from_pretrained
        if os.path.isdir(model):
            return load(os.path.join(model, name))
        # Not a local directory: resolve the individual files through
        # cached_file and pass the sub-folder's own config explicitly.
        config = AutoConfig.from_pretrained(
            cached_file(model, f"{name}/config.json")
        )
        return load(
            cached_file(model, f"{name}/pytorch_model.bin"), config=config
        )

    def __call__(
        self, text: str, tag: str = "str"
    ) -> Union[List[List[str]], str]:
        """Parse *text* and return one ten-column row per word.

        :param str text: text to parse
        :param str tag: ``"list"`` returns the rows as a list of lists of
            strings; any other value returns them as tab-separated lines
            followed by one blank line.
        :return: the parsed rows in the requested form
        :rtype: Union[List[List[str]], str]
        """
        # (start offset, end offset, dependency label) for every word.
        words = [
            (t["start"], t["end"], t["entity_group"])
            for t in self.deprel(text)
        ]
        if not words:
            # Nothing to parse: skip the models (an empty batch cannot be
            # fed to them) and return what zero rows would produce.
            return [] if tag == "list" else "\n"
        # Tagger label pieces keyed by start offset. Labels are split on
        # "|": the first piece feeds column 4, the rest column 6.
        tags = {
            t["start"]: t["entity"].split("|") for t in self.tagger(text)
        }
        forms = [text[start:end] for start, end, _ in words]
        heads = self._predict_heads(forms, [label for _, _, label in words])
        rows = self._build_rows(words, forms, tags, heads)
        if tag == "list":
            return rows
        return "".join("\t".join(row) + "\n" for row in rows) + "\n"

    def _predict_heads(self, forms: List[str], deprels: List[str]):
        """Return the head index of every word (index 0 is the root node).

        One sequence is built per word: ``[CLS] word [SEP]`` followed by
        the whole sentence with that word replaced by the mask token, then
        ``[SEP]``. Start/end logits over each candidate's span are summed
        into a score matrix that is decoded with Chu-Liu-Edmonds.
        """
        n = len(forms)
        scores = numpy.full((n + 1, n + 1), numpy.nan)
        token_ids = self.tokenizer(forms, add_special_tokens=False)[
            "input_ids"
        ]
        cls_id = self.tokenizer.cls_token_id
        sep_id = self.tokenizer.sep_token_id
        mask_id = self.tokenizer.mask_token_id
        batch = [
            [[cls_id] + ids + [sep_id]]
            + token_ids[:i]
            + [[mask_id]]
            + token_ids[i + 1:]
            + [[sep_id]]
            for i, ids in enumerate(token_ids)
        ]
        # bounds[i][k] is the end position (exclusive) of segment k of
        # sequence i, so word j spans bounds[i][j] .. bounds[i][j + 1] - 1.
        bounds = [
            list(accumulate(len(segment) for segment in segments))
            for segments in batch
        ]
        with torch.no_grad():
            output = self.model(
                input_ids=torch.tensor(
                    [list(chain.from_iterable(s)) for s in batch]
                ),
                # Type 0 for the leading "[CLS] word [SEP]" part, 1 after.
                token_type_ids=torch.tensor(
                    [[0] * b[0] + [1] * (b[-1] - b[0]) for b in bounds]
                ),
            )
        start_logits = output.start_logits.tolist()
        end_logits = output.end_logits.tolist()
        for i in range(n):
            for j in range(n):
                # The masked position (i == j) scores attachment to root,
                # which is stored in column 0.
                scores[i + 1, 0 if i == j else j + 1] = (
                    start_logits[i][bounds[i][j]]
                    + end_logits[i][bounds[i][j + 1] - 1]
                )
        heads = ufal.chu_liu_edmonds.chu_liu_edmonds(scores)[0]
        if sum(1 for head in heads if head == 0) != 1:
            # Not exactly one word attached to root: keep a single root
            # candidate (the first word labelled "root", else the best
            # root score), blank the others and decode again.
            first_root = (deprels + ["root"]).index("root")
            if first_root < n:
                keep = first_root + 1
            else:
                keep = numpy.nanargmax(scores[:, 0])
            scores[0:keep, 0] = scores[keep + 1:, 0] = numpy.nan
            heads = ufal.chu_liu_edmonds.chu_liu_edmonds(scores)[0]
        return heads

    @staticmethod
    def _build_rows(words, forms, tags, heads) -> List[List[str]]:
        """Build the ten-column output row of every word."""
        n = len(words)
        rows = []
        for i, (start, end, label) in enumerate(words, 1):
            # Relabel so that exactly the word headed by 0 is "root".
            if heads[i] == 0:
                label = "root"
            elif label == "root":
                label = "dep"
            # words[i] is the NEXT word because i is 1-based; a gap
            # before it means the text has something between the two.
            has_gap = i < n and end < words[i][0]
            rows.append(
                [
                    str(i),
                    forms[i - 1],
                    "_",
                    tags[start][0][2:],  # drop the 2-character label prefix
                    "_",
                    "|".join(tags[start][1:]),
                    str(heads[i]),
                    label,
                    "_",
                    "_" if has_gap else "SpaceAfter=No",
                ]
            )
        return rows

| ver. 1.4 | Github | . | PHP 7.4.33 | Generation time: 0.63 | proxy | phpinfo | Settings