File manager

File manager - Edit - /usr/local/lib/python3.9/dist-packages/pythainlp/transliterate/royin.py

Back
# -- coding: utf-8 -- # SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 """ The Royal Thai General System of Transcription (RTGS) is the official system for rendering Thai words in the Latin alphabet. It was published by the Royal Institute of Thailand. :See Also: * `Wikipedia <https://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription>`_ """ import re from pythainlp import thai_consonants, word_tokenize # vowel _vowel_patterns = """เียว,\\1iao แ็ว,\\1aeo เือย,\\1ueai แว,\\1aeo เ็ว,\\1eo เว,\\1eo ิว,\\1io วย,\\1uai เย,\\1oei อย,\\1oi โย,\\1oi ุย,\\1ui าย,\\1ai ไย,\\1ai ัย,\\1ai ไ,\\1\\2ai ไ,\\1ai ใ,\\1ai ว,\\1ua\\2 ัวะ,\\1ua ัว,\\1ua เือะ,\\1uea เือ,\\1uea เียะ,\\1ia เีย,\\1ia เอะ,\\1oe เอ,\\1oe เิ,\\1oe อ,\\1o เาะ,\\1o เ็,\\1e โะ,\\1o โ,\\1o แะ,\\1ae แ,\\1ae เาะ,\\1e าว,\\1ao เา,\\1ao เ,\\1e ู,\\1u ุ,\\1u ื,\\1ue ึ,\\1ue ี,\\1i ิ,\\1i ำ,\\1am า,\\1a ั,\\1a ะ,\\1a #ฤ,\\1rue $ฤ,\\1ri""" _vowel_patterns = _vowel_patterns.replace("", f"([{thai_consonants}])") _vowel_patterns = _vowel_patterns.replace("#", "([คนพมห])") _vowel_patterns = _vowel_patterns.replace("$", "([กตทปศส])") _VOWELS = [x.split(",") for x in _vowel_patterns.split("\n")] # พยัญชนะ ต้น สะกด _CONSONANTS = { "ก": ["k", "k"], "ข": ["kh", "k"], "ฃ": ["kh", "k"], "ค": ["kh", "k"], "ฅ": ["kh", "k"], "ฆ": ["kh", "k"], "ง": ["ng", "ng"], "จ": ["ch", "t"], "ฉ": ["ch", "t"], "ช": ["ch", "t"], "ซ": ["s", "t"], "ฌ": ["ch", "t"], "ญ": ["y", "n"], "ฎ": ["d", "t"], "ฏ": ["t", "t"], "ฐ": ["th", "t"], # ฑ พยัญชนะต้น เป็น d ได้ "ฑ": ["th", "t"], "ฒ": ["th", "t"], "ณ": ["n", "n"], "ด": ["d", "t"], "ต": ["t", "t"], "ถ": ["th", "t"], "ท": ["th", "t"], "ธ": ["th", "t"], "น": ["n", "n"], "บ": ["b", "p"], "ป": ["p", "p"], "ผ": ["ph", "p"], "ฝ": ["f", "p"], "พ": ["ph", "p"], "ฟ": ["f", "p"], "ภ": ["ph", "p"], "ม": ["m", "m"], "ย": ["y", ""], "ร": ["r", "n"], "ฤ": ["rue", ""], "ล": ["l", "n"], "ว": ["w", ""], "ศ": ["s", "t"], "ษ": ["s", "t"], "ส": ["s", "t"], "ห": ["h", ""], "ฬ": ["l", "n"], "อ": ["", ""], "ฮ": ["h", ""], } _THANTHAKHAT = "\u0e4c" _RE_CONSONANT = re.compile(f"[{thai_consonants}]") _RE_NORMALIZE = re.compile( f"จน์\|มณ์\|ณฑ์\|ทร์\|ตร์\|[{thai_consonants}]{_THANTHAKHAT}\|" f"[{thai_consonants}][\u0e30-\u0e39]{_THANTHAKHAT}" # Paiyannoi, Maiyamok, Tonemarks, Thanthakhat, Nikhahit, other signs r"\|[\u0e2f\u0e46\u0e48-\u0e4f\u0e5a\u0e5b]" ) def _normalize(word: str) -> str: """ Remove silence, no sound, and tonal characters. ตัดอักษรที่ไม่ออกเสียง (การันต์ ไปยาลน้อย ไม้ยมก*) และวรรณยุกต์ทิ้ง """ return _RE_NORMALIZE.sub("", word) def _replace_vowels(word: str) -> str: for vowel in _VOWELS: word = re.sub(vowel[0], vowel[1], word) return word def _replace_consonants(word: str, consonants: str) -> str: _HO_HIP = "\u0e2b" # ห _RO_RUA = "\u0e23" # ร _DOUBLE_RO_RUA = _RO_RUA + _RO_RUA if not consonants: return word skip = False mod_chars = [] j = 0 # j is the index of consonants for i in range(len(word)): if skip: skip = False j += 1 elif word[i] not in _CONSONANTS: # word[i] is not a Thai consonant. mod_chars.append(word[i]) elif ( len(mod_chars) == 0 and word[i] == _HO_HIP and len(consonants) != 1 ): # Skip HO HIP except that HO HIP is the only one consonant j += 1 elif ( len(mod_chars) == 0 ): # The first character must be an initial consonant. mod_chars.append(_CONSONANTS[consonants[j]][0]) j += 1 elif word[i:] == _DOUBLE_RO_RUA: # Double RO RUA is in end of word skip = True mod_chars.append("a") mod_chars.append("n") j += 1 elif word[i : i + 2] == _DOUBLE_RO_RUA: skip = True mod_chars.append("a") j += 1 else: # Assume that the rest are final consonants. mod_chars.append(_CONSONANTS[consonants[j]][1]) j += 1 return "".join(mod_chars) # support function for romanize() def _romanize(word: str) -> str: word = _replace_vowels(_normalize(word)) consonants = _RE_CONSONANT.findall(word) # 2-character word, all consonants if len(word) == 2 and len(consonants) == 2: word = list(word) word.insert(1, "o") word = "".join(word) word = _replace_consonants(word, consonants) return word def romanize(text: str) -> str: """Render Thai words in Latin alphabet, using RTGS Royal Thai General System of Transcription (RTGS), is the official system by the Royal Institute of Thailand. :param text: Thai text to be romanized :type text: str :return: A string of Thai words rendered in the Latin alphabet :rtype: str """ words = word_tokenize(text) romanized_words = [_romanize(word) for word in words] return "".join(romanized_words)

| ver. 1.4 | Github | . | PHP 7.4.33 | Generation time: 0.53 | proxy | phpinfo | Settings