Spaces:
Runtime error
Runtime error
Matthijs Hollemans
committed on
Commit
•
f0839e8
1
Parent(s):
ee061f2
here we go!
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- README.md +2 -2
- app.py +140 -0
- requirements.txt +8 -0
- uroman/.gitignore +35 -0
- uroman/LICENSE.txt +11 -0
- uroman/README.md +165 -0
- uroman/README.txt +141 -0
- uroman/bin/de-accent.pl +201 -0
- uroman/bin/string-distance.pl +99 -0
- uroman/bin/uroman-quick.pl +58 -0
- uroman/bin/uroman-tsv.sh +28 -0
- uroman/bin/uroman.pl +138 -0
- uroman/data/Chinese_to_Pinyin.txt +0 -0
- uroman/data/Scripts.txt +135 -0
- uroman/data/UnicodeData.txt +0 -0
- uroman/data/UnicodeDataOverwrite.txt +442 -0
- uroman/data/romanization-table-arabic-block.txt +179 -0
- uroman/data/romanization-table.txt +2019 -0
- uroman/data/romanization-table.v1.2.1.txt +814 -0
- uroman/data/string-distance-cost-rules.txt +896 -0
- uroman/lib/JSON.pm +2317 -0
- uroman/lib/JSON/backportPP.pm +2806 -0
- uroman/lib/JSON/backportPP/Boolean.pm +27 -0
- uroman/lib/JSON/backportPP/Compat5005.pm +131 -0
- uroman/lib/JSON/backportPP/Compat5006.pm +173 -0
- uroman/lib/NLP/Chinese.pm +239 -0
- uroman/lib/NLP/English.pm +0 -0
- uroman/lib/NLP/Romanizer.pm +2020 -0
- uroman/lib/NLP/UTF8.pm +1404 -0
- uroman/lib/NLP/stringDistance.pm +724 -0
- uroman/lib/NLP/utilities.pm +0 -0
- uroman/tarballs/uroman-v1.0.tar.gz +3 -0
- uroman/tarballs/uroman-v1.1.tar.gz +3 -0
- uroman/tarballs/uroman-v1.2.4.tar.gz +3 -0
- uroman/tarballs/uroman-v1.2.5.tar.gz +3 -0
- uroman/tarballs/uroman-v1.2.6.tar.gz +3 -0
- uroman/tarballs/uroman-v1.2.7.tar.gz +3 -0
- uroman/tarballs/uroman-v1.2.tar.gz +3 -0
- uroman/test/multi-script.txt +32 -0
- uroman/test/multi-script.uroman-ref.txt +32 -0
- uroman/test/string-similarity-test-input.txt +7 -0
- uroman/test/string-similarity-test-output-ref.txt +8 -0
- uroman/text/amh.txt +7 -0
- uroman/text/ara.txt +3 -0
- uroman/text/ben.txt +8 -0
- uroman/text/bod.txt +3 -0
- uroman/text/egy.txt +5 -0
- uroman/text/ell.txt +8 -0
- uroman/text/fas.txt +6 -0
- uroman/text/heb.txt +15 -0
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
colorFrom: indigo
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
|
|
1 |
---
|
2 |
+
title: MMS-TTS Demo
|
3 |
+
emoji: 🥳
|
4 |
colorFrom: indigo
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
app.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
import tempfile
|
7 |
+
|
8 |
+
from transformers import VitsModel, VitsTokenizer
|
9 |
+
|
10 |
+
|
11 |
+
# Hub checkpoint to load for each language name used as a key below.
_CHECKPOINTS = {
    "English": "Matthijs/mms-tts-eng",
    "German": "Matthijs/mms-tts-deu",
    "Korean": "Matthijs/mms-tts-kor",
}

# One VITS model per language, loaded when the module is executed.
models = {
    language: VitsModel.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}

# The tokenizer loaded from the same checkpoint as each model, keyed the
# same way as `models`.
tokenizers = {
    language: VitsTokenizer.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}
|
22 |
+
|
23 |
+
|
24 |
+
def uromanize(text, uroman_pl):
    """Romanize ``text`` by piping it through the uroman Perl script.

    For certain checkpoints, the text needs to be romanized. MMS-TTS uses
    uroman.pl for this, from https://github.com/isi-nlp/uroman. This needs
    to be installed in the folder "uroman".

    Args:
        text: The text to romanize.
        uroman_pl: Path to the ``uroman.pl`` script.

    Returns:
        The script's output as a single line: every run of whitespace
        (including line breaks) is collapsed to one space and the ends are
        stripped. An empty output yields an empty string.

    Raises:
        subprocess.CalledProcessError: If the Perl process exits with a
            non-zero status.
        FileNotFoundError: If the ``perl`` executable cannot be found.
    """
    # Imported locally so this function does not rely on a module-level
    # import being present.
    import subprocess

    iso = "xxx"  # value handed to uroman's -l (language code) option

    # An argument list (no shell) keeps script paths containing spaces or
    # shell metacharacters intact. Feeding the text through stdin and reading
    # stdout replaces the temporary files that were re-opened by name, and
    # makes the UTF-8 encoding explicit instead of locale-dependent.
    result = subprocess.run(
        ["perl", uroman_pl, "-l", iso],
        input=text.encode("utf-8"),
        stdout=subprocess.PIPE,
        check=True,  # surface a failing Perl run instead of ignoring it
    )
    romanized = result.stdout.decode("utf-8")

    # Collapse all whitespace so multi-line input is kept in full rather than
    # being truncated to its first line.
    return re.sub(r"\s+", " ", romanized).strip()
|
43 |
+
|
44 |
+
|
45 |
+
def predict(text, language=None):
    """Synthesize speech for ``text`` with the model selected by ``language``.

    Args:
        text: The text to speak.
        language: Key into the module-level ``models`` and ``tokenizers``
            dictionaries. ``"Korean"`` input is romanized with uroman first.

    Returns:
        A pair ``((sample_rate, samples), processed_text)``. ``sample_rate``
        is 16000 and ``samples`` is an int16 NumPy array. ``processed_text``
        is the romanized text for Korean, and otherwise the text decoded back
        from the token ids. Blank input yields an empty waveform and an empty
        string.

    Raises:
        KeyError: If ``language`` is not a key of ``tokenizers``/``models``.
    """
    if not text.strip():
        # Return both values on this path too: the caller wires this function
        # to two outputs (audio and processed text), so a lone audio tuple
        # would be split across them.
        return (16000, np.zeros(0, dtype=np.int16)), ""

    if language == "Korean":
        uroman_pl = os.path.join("uroman", "bin", "uroman.pl")
        text = uromanize(text, uroman_pl)

    tokenizer = tokenizers[language]
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    if language != "Korean":
        # Report the text as the tokenizer saw it.
        text = tokenizer.batch_decode(input_ids)[0]

    model = models[language]
    with torch.no_grad():
        outputs = model(input_ids)

    speech = outputs.audio[0]
    # NOTE(review): presumably the model emits float samples in [-1, 1] --
    # confirm. Clip before scaling so any stray out-of-range sample saturates
    # instead of wrapping around in the int16 conversion.
    speech = np.clip(speech.numpy(), -1.0, 1.0)
    speech = (speech * 32767).astype(np.int16)
    return (16000, speech), text
|
67 |
+
|
68 |
+
|
69 |
+
# Passed to gr.Interface as the page title.
title = "MMS-TTS speech synthesis"

# Passed to gr.Interface as the description (Markdown).
description = """
Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project aims to provide
speech technology across a diverse range of languages. The MMS-TTS project contains a collection of
over 1000 text-to-speech (TTS) models.

This demo shows how to use MMS-TTS using 🤗 Transformers. Since MMS-TTS is based on the VITS
model, this code can also be used to run VITS checkpoints.
For a full list of checkpoints, [click here](https://huggingface.co/models?filter=vits).

As the model performs random sampling, the generated speech is slightly different each time.
The voice may also vary between runs, or sometimes even in the same sentence.
(Note that 🤗 Transformers also supports multispeaker VITS checkpoints but the MMS-TTS checkpoints
are not conditioned on a speaker ID.)
"""

# Passed to gr.Interface as the article (HTML): reference links and the
# BibTeX entry for the MMS paper.
article = """
<div style='margin:20px auto;'>

<p>References: <a href="https://arxiv.org/abs/2305.13516">MMS paper</a> |
<a href="https://ai.facebook.com/blog/multilingual-model-speech-recognition/">blog post</a> |
<a href="https://huggingface.co/facebook/mms-tts">original weights</a> |
<a href="https://huggingface.co/spaces/mms-meta/MMS">original MMS space</a>
</p>

<pre>
@article{pratap2023mms,
title={Scaling Speech Technology to 1,000+ Languages},
author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
journal={arXiv},
year={2023}
}
</pre>

</div>
"""

# Each row is [input text, language], matching the two inputs of `predict`.
examples = [
    ["It is not in the stars to hold our destiny but in ourselves.", "English"],
    ["The octopus and Oliver went to the opera in October.", "English"],
    ["She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.", "English"],
    ["Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.", "English"],
    ["A synonym for cinnamon is a cinnamon synonym.", "English"],
    ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?", "English"],

    ["Eins, zwei, Polizei. Drei, vier, Grenadier. Fünf, sechs, alte Keks. Sieben, acht, gute Nacht.", "German"],
    ["Alle meine Entchen, schwimmen auf dem See. Köpfchen in das Wasser, Schwänzchen in die Höh.", "German"],

    ["안녕 세상, 날씨는 아름다워", "Korean"],  # Hello world, the weather is beautiful (Google Translate)
]

# Input widgets, created in the order they are handed to `predict`.
text_input = gr.Text(label="Input Text")
language_input = gr.Radio(
    label="Language",
    choices=["English", "German", "Korean"],
    value="English",
)

# Output widgets, in the order `predict` returns its values.
speech_output = gr.Audio(label="Generated Speech", type="numpy")
processed_text_output = gr.Text(label="Processed text")

# Build the interface around `predict` and start serving it.
demo = gr.Interface(
    fn=predict,
    inputs=[text_input, language_input],
    outputs=[speech_output, processed_text_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
git+https://github.com/hollance/transformers.git@vits
|
2 |
+
torch
|
3 |
+
torchaudio
|
4 |
+
soundfile
|
5 |
+
librosa
|
6 |
+
samplerate
|
7 |
+
resampy
|
8 |
+
sentencepiece
|
uroman/.gitignore
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
!Build/
|
2 |
+
.last_cover_stats
|
3 |
+
/META.yml
|
4 |
+
/META.json
|
5 |
+
/MYMETA.*
|
6 |
+
*.o
|
7 |
+
*.pm.tdy
|
8 |
+
*.bs
|
9 |
+
|
10 |
+
# Devel::Cover
|
11 |
+
cover_db/
|
12 |
+
|
13 |
+
# Devel::NYTProf
|
14 |
+
nytprof.out
|
15 |
+
|
16 |
+
# Dist::Zilla
|
17 |
+
/.build/
|
18 |
+
|
19 |
+
# Module::Build
|
20 |
+
_build/
|
21 |
+
Build
|
22 |
+
Build.bat
|
23 |
+
|
24 |
+
# Module::Install
|
25 |
+
inc/
|
26 |
+
|
27 |
+
# ExtUtils::MakeMaker
|
28 |
+
/blib/
|
29 |
+
/_eumm/
|
30 |
+
/*.gz
|
31 |
+
/Makefile
|
32 |
+
/Makefile.old
|
33 |
+
/MANIFEST.bak
|
34 |
+
/pm_to_blib
|
35 |
+
/*.zip
|
uroman/LICENSE.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Copyright (C) 2015-2020 Ulf Hermjakob, USC Information Sciences Institute
|
2 |
+
|
3 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
4 |
+
|
5 |
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
6 |
+
|
7 |
+
Any publication of projects using uroman shall acknowledge its use: "This project uses the universal romanizer software 'uroman' written by Ulf Hermjakob, USC Information Sciences Institute (2015-2020)".
|
8 |
+
Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track.
|
9 |
+
|
10 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
11 |
+
|
uroman/README.md
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# uroman
|
2 |
+
|
3 |
+
*uroman* is a *universal romanizer*. It converts text in any script to the Latin alphabet.
|
4 |
+
|
5 |
+
Version: 1.2.8
|
6 |
+
Release date: April 23, 2021
|
7 |
+
Author: Ulf Hermjakob, USC Information Sciences Institute
|
8 |
+
|
9 |
+
|
10 |
+
### Usage
|
11 |
+
```bash
|
12 |
+
$ uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
|
13 |
+
where the optional <lang-code> is a 3-letter language code, e.g. ara, bel, bul, deu, ell, eng, fas,
|
14 |
+
grc, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
|
15 |
+
--chart specifies chart output (in JSON format) to represent alternative romanizations.
|
16 |
+
--no-cache disables caching.
|
17 |
+
```
|
18 |
+
### Examples
|
19 |
+
```bash
|
20 |
+
$ bin/uroman.pl < text/zho.txt
|
21 |
+
$ bin/uroman.pl -l tur < text/tur.txt
|
22 |
+
$ bin/uroman.pl -l heb --chart < text/heb.txt
|
23 |
+
$ bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
|
24 |
+
```
|
25 |
+
|
26 |
+
Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
|
27 |
+
Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
|
28 |
+
Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or
|
29 |
+
Yiddish will improve romanization for those languages as some letters in those
|
30 |
+
languages have different sound values from other languages using the same script
|
31 |
+
(French, Russian, Hebrew respectively).
|
32 |
+
No effect for other languages in this version.
|
33 |
+
|
34 |
+
### Bibliography
|
35 |
+
Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. ACL-2018 Best Demo Paper Award. [Paper in ACL Anthology](https://www.aclweb.org/anthology/P18-4003) | [Poster](https://www.isi.edu/~ulf/papers/poster-uroman-acl2018.pdf) | [BibTex](https://www.aclweb.org/anthology/P18-4003.bib)
|
36 |
+
|
37 |
+
### Change History
|
38 |
+
Changes in version 1.2.8
|
39 |
+
* Updated to Unicode 13.0 (2021), which supports several new scripts (10% larger UnicodeData.txt).
|
40 |
+
* Improved support for Georgian.
|
41 |
+
* Preserve various symbols (as opposed to mapping to the symbols' names).
|
42 |
+
* Various small improvements.
|
43 |
+
|
44 |
+
Changes in version 1.2.7
|
45 |
+
* Improved support for Pashto.
|
46 |
+
|
47 |
+
Changes in version 1.2.6
|
48 |
+
* Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
|
49 |
+
* Added support for English Braille.
|
50 |
+
* Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
|
51 |
+
reflecting a casual style that many native speakers of those languages use
|
52 |
+
when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
|
53 |
+
rather than phonetically motivated combinations of letters (e.g. "sh").
|
54 |
+
* When a line starts with "::lcode xyz ", the new uroman version will switch to
|
55 |
+
that language for that line. This is used for the new reference test file.
|
56 |
+
* Various small improvements.
|
57 |
+
|
58 |
+
Changes in version 1.2.5
|
59 |
+
* Improved support for Armenian and eight languages using Cyrillic scripts.
|
60 |
+
-- For Serbian and Macedonian, which are often written in both Cyrillic
|
61 |
+
and Latin scripts, uroman will map both official versions to the same
|
62 |
+
romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
|
63 |
+
properly reflects the pronunciation of the city's name).
|
64 |
+
For both Serbian and Macedonian, casual writers often use a simplified
|
65 |
+
Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
|
66 |
+
and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
|
67 |
+
other such pairs. The casual romanization can be simulated by using
|
68 |
+
alternative uroman language codes "srp2" and "mkd2", which romanize
|
69 |
+
both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
|
70 |
+
* Various small improvements.
|
71 |
+
|
72 |
+
Changes in version 1.2.4
|
73 |
+
* Bug-fix that generated two empty lines for each empty line in cache mode.
|
74 |
+
|
75 |
+
Changes in version 1.2
|
76 |
+
* Run-time improvement based on (1) token-based caching and (2) shortcut
|
77 |
+
romanization (identity) of ASCII strings for default 1-best (non-chart)
|
78 |
+
output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
|
79 |
+
large size texts.
|
80 |
+
* Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
|
81 |
+
languages.
|
82 |
+
* Richer lattice structure (more alternatives) for "Romanization" of English
|
83 |
+
to support better matching to romanizations of other languages.
|
84 |
+
Changes output only when --chart option is specified. No change in output for
|
85 |
+
default 1-best output, which for ASCII characters is always the input string.
|
86 |
+
|
87 |
+
Changes in version 1.1 (major upgrade)
|
88 |
+
* Offers chart output (in JSON format) to represent alternative romanizations.
|
89 |
+
-- Location of first character is defined to be "line: 1, start:0, end:0".
|
90 |
+
* Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
|
91 |
+
* Improved web-interface at http://www.isi.edu/~ulf/uroman.html
|
92 |
+
-- Shows corresponding original and romanization text in red
|
93 |
+
when hovering over a text segment.
|
94 |
+
-- Shows alternative romanizations when hovering over romanized text
|
95 |
+
marked by dotted underline.
|
96 |
+
-- Added right-to-left script detection and improved display for right-to-left
|
97 |
+
script text (as determined line by line).
|
98 |
+
-- On-page support for some scripts that are often not pre-installed on users'
|
99 |
+
computers (Burmese, Egyptian, Klingon).
|
100 |
+
|
101 |
+
Changes in version 1.0 (major upgrade)
|
102 |
+
* Upgraded principal internal data structure from string to lattice.
|
103 |
+
* Improvements mostly in vowelization of South and Southeast Asian languages.
|
104 |
+
* Vocalic 'r' more consistently treated as vowel (no additional vowel added).
|
105 |
+
* Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
|
106 |
+
* Japanese Katakana middle dots now mapped to ASCII space.
|
107 |
+
* Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
|
108 |
+
* Some corrections regarding analysis of Chinese numbers.
|
109 |
+
* Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
|
110 |
+
* Zero-width characters dropped, except line/sentence-initial byte order marks.
|
111 |
+
* Spaces normalized to ASCII space.
|
112 |
+
* Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
|
113 |
+
* Tested against previous version of uroman with a new uroman visual diff tool.
|
114 |
+
* Almost an order of magnitude faster.
|
115 |
+
|
116 |
+
Changes in version 0.7 (minor upgrade)
|
117 |
+
* Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
|
118 |
+
Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
|
119 |
+
Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
|
120 |
+
or Chinese characters in Uyghur texts.
|
121 |
+
|
122 |
+
Changes in version 0.6 (minor upgrade)
|
123 |
+
* Added support for two letter characters used in Uzbek:
|
124 |
+
(1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
|
125 |
+
(2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
|
126 |
+
Both are now mapped to "'" (plain ASCII apostrophe).
|
127 |
+
* Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
|
128 |
+
even when they are not preceded by "ئ" (yeh with hamza above).
|
129 |
+
* Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
|
130 |
+
("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
|
131 |
+
* Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
|
132 |
+
However, it is strongly recommended to normalize any presentation form Arabic letters
|
133 |
+
to their non-presentation form before calling uroman.
|
134 |
+
* Added force flush directive ($|=1;).
|
135 |
+
|
136 |
+
Changes in version 0.5 (minor upgrade)
|
137 |
+
* Improvements for Uyghur (make sure to use language option: -l uig)
|
138 |
+
|
139 |
+
Changes in version 0.4 (minor upgrade)
|
140 |
+
* Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
|
141 |
+
* Minor change for Arabic (added "alef+fathatan" = "an")
|
142 |
+
|
143 |
+
New features in version 0.3
|
144 |
+
* Covers Mandarin (Chinese)
|
145 |
+
* Improved romanization for numerous languages
|
146 |
+
* Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
|
147 |
+
* Maps from native digits to Western numbers
|
148 |
+
* Faster for South Asian languages
|
149 |
+
|
150 |
+
### Other features
|
151 |
+
* Web interface: http://www.isi.edu/~ulf/uroman.html
|
152 |
+
* Vowelization is provided when locally computable, e.g. for many South Asian languages and Tibetan.
|
153 |
+
|
154 |
+
### Limitations
|
155 |
+
* The current version of uroman has a few limitations, some of which we plan to address in future versions.
|
156 |
+
For Japanese, *uroman* currently romanizes hiragana and katakana as expected, but kanji are interpreted as Chinese characters and romanized as such.
|
157 |
+
For Egyptian hieroglyphs, only single-sound phonetic characters and numbers are currently romanized.
|
158 |
+
For Linear B, only phonetic syllabic characters are romanized.
|
159 |
+
For some other extinct scripts such as cuneiform, no romanization is provided.
|
160 |
+
* A romanizer is not a full transliterator. For example, this version of
|
161 |
+
uroman does not vowelize text that lacks explicit vowelization such as
|
162 |
+
normal text in Arabic and Hebrew (without diacritics/points).
|
163 |
+
|
164 |
+
### Acknowledgments
|
165 |
+
This research is based upon work supported in part by the Office of the Director of National Intelligence (ODNI), Intelligence Advanced Research Projects Activity (IARPA), via contract # FA8650-17-C-9116, and by research sponsored by Air Force Research Laboratory (AFRL) under agreement number FA8750-19-1-1000. The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies, either expressed or implied, of ODNI, IARPA, Air Force Laboratory, DARPA, or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for governmental purposes notwithstanding any copyright annotation therein.
|
uroman/README.txt
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
uroman version 1.2.8
|
2 |
+
Release date: April 23, 2021
|
3 |
+
Author: Ulf Hermjakob, USC Information Sciences Institute
|
4 |
+
|
5 |
+
uroman is a universal romanizer. It converts text in any script to the Latin alphabet.
|
6 |
+
|
7 |
+
Usage: uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
|
8 |
+
where the optional <lang-code> is a 3-letter language code, e.g. ara, bel, bul, deu, ell, eng, fas,
|
9 |
+
grc, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
|
10 |
+
--chart specifies chart output (in JSON format) to represent alternative romanizations.
|
11 |
+
--no-cache disables caching.
|
12 |
+
Examples: bin/uroman.pl < text/zho.txt
|
13 |
+
bin/uroman.pl -l tur < text/tur.txt
|
14 |
+
bin/uroman.pl -l heb --chart < text/heb.txt
|
15 |
+
bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
|
16 |
+
|
17 |
+
Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
|
18 |
+
Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
|
19 |
+
Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or Yiddish
|
20 |
+
will improve romanization for those languages as some letters in those languages
|
21 |
+
have different sound values from other languages using the same script.
|
22 |
+
No effect for other languages in this version.
|
23 |
+
|
24 |
+
Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. [Best Demo Paper Award]
|
25 |
+
|
26 |
+
Changes in version 1.2.8
|
27 |
+
* Improved support for Georgian.
|
28 |
+
* Updated UnicodeData.txt to version 13 (2021) with several new scripts (10% larger).
|
29 |
+
* Preserve various symbols (as opposed to mapping to the symbols' names).
|
30 |
+
* Various small improvements.
|
31 |
+
Changes in version 1.2.7
|
32 |
+
* Improved support for Pashto.
|
33 |
+
Changes in version 1.2.6
|
34 |
+
* Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
|
35 |
+
* Added support for English Braille.
|
36 |
+
* Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
|
37 |
+
reflecting a casual style that many native speakers of those languages use
|
38 |
+
when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
|
39 |
+
rather than phonetically motivated combinations of letters (e.g. "sh").
|
40 |
+
* When a line starts with "::lcode xyz ", the new uroman version will switch to
|
41 |
+
that language for that line. This is used for the new reference test file.
|
42 |
+
* Various small improvements.
|
43 |
+
Changes in version 1.2.5
|
44 |
+
* Improved support for Armenian and eight languages using Cyrillic scripts.
|
45 |
+
-- For Serbian and Macedonian, which are often written in both Cyrillic
|
46 |
+
and Latin scripts, uroman will map both official versions to the same
|
47 |
+
romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
|
48 |
+
properly reflects the pronunciation of the city's name).
|
49 |
+
For both Serbian and Macedonian, casual writers often use a simplified
|
50 |
+
Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
|
51 |
+
and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
|
52 |
+
other such pairs. The casual romanization can be simulated by using
|
53 |
+
alternative uroman language codes "srp2" and "mkd2", which romanize
|
54 |
+
both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
|
55 |
+
* Various small improvements.
|
56 |
+
Changes in version 1.2.4
|
57 |
+
* Added support for Tifinagh (a script used for Berber languages).
|
58 |
+
* Bug-fix that generated two empty lines for each empty line in cache mode.
|
59 |
+
Changes in version 1.2.3
|
60 |
+
* Exclude emojis, dingbats, many other pictographs from being romanized (e.g. to "face")
|
61 |
+
Changes in version 1.2
|
62 |
+
* Run-time improvement based on (1) token-based caching and (2) shortcut
|
63 |
+
romanization (identity) of ASCII strings for default 1-best (non-chart)
|
64 |
+
output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
|
65 |
+
large size texts.
|
66 |
+
* Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
|
67 |
+
languages.
|
68 |
+
* Richer lattice structure (more alternatives) for "Romanization" of English
|
69 |
+
to support better matching to romanizations of other languages.
|
70 |
+
Changes output only when --chart option is specified. No change in output for
|
71 |
+
default 1-best output, which for ASCII characters is always the input string.
|
72 |
+
Changes in version 1.1 (major upgrade)
|
73 |
+
* Offers chart output (in JSON format) to represent alternative romanizations.
|
74 |
+
-- Location of first character is defined to be "line: 1, start:0, end:0".
|
75 |
+
* Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
|
76 |
+
* Improved web-interface at http://www.isi.edu/~ulf/uroman.html
|
77 |
+
-- Shows corresponding original and romanization text in red
|
78 |
+
when hovering over a text segment.
|
79 |
+
-- Shows alternative romanizations when hovering over romanized text
|
80 |
+
marked by dotted underline.
|
81 |
+
-- Added right-to-left script detection and improved display for right-to-left
|
82 |
+
script text (as determined line by line).
|
83 |
+
-- On-page support for some scripts that are often not pre-installed on users'
|
84 |
+
computers (Burmese, Egyptian, Klingon).
|
85 |
+
Changes in version 1.0 (major upgrade)
|
86 |
+
* Upgraded principal internal data structure from string to lattice.
|
87 |
+
* Improvements mostly in vowelization of South and Southeast Asian languages.
|
88 |
+
* Vocalic 'r' more consistently treated as vowel (no additional vowel added).
|
89 |
+
* Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
|
90 |
+
* Japanese Katakana middle dots now mapped to ASCII space.
|
91 |
+
* Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
|
92 |
+
* Some corrections regarding analysis of Chinese numbers.
|
93 |
+
* Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
|
94 |
+
* Zero-width characters dropped, except line/sentence-initial byte order marks.
|
95 |
+
* Spaces normalized to ASCII space.
|
96 |
+
* Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
|
97 |
+
* Tested against previous version of uroman with a new uroman visual diff tool.
|
98 |
+
* Almost an order of magnitude faster.
|
99 |
+
Changes in version 0.7 (minor upgrade)
|
100 |
+
* Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
|
101 |
+
Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
|
102 |
+
Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
|
103 |
+
or Chinese characters in Uyghur texts.
|
104 |
+
Changes in version 0.6 (minor upgrade)
|
105 |
+
* Added support for two letter characters used in Uzbek:
|
106 |
+
(1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
|
107 |
+
(2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
|
108 |
+
Both are now mapped to "'" (plain ASCII apostrophe).
|
109 |
+
* Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
|
110 |
+
even when they are not preceded by "ئ" (yeh with hamza above).
|
111 |
+
* Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
|
112 |
+
("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
|
113 |
+
* Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
|
114 |
+
However, it is strongly recommended to normalize any presentation form Arabic letters
|
115 |
+
to their non-presentation form before calling uroman.
|
116 |
+
* Added force flush directive ($|=1;).
|
117 |
+
Changes in version 0.5 (minor upgrade)
|
118 |
+
* Improvements for Uyghur (make sure to use language option: -l uig)
|
119 |
+
Changes in version 0.4 (minor upgrade)
|
120 |
+
* Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
|
121 |
+
* Minor change for Arabic (added "alef+fathatan" = "an")
|
122 |
+
New features in version 0.3
|
123 |
+
* Covers Mandarin (Chinese)
|
124 |
+
* Improved romanization for numerous languages
|
125 |
+
* Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
|
126 |
+
* Maps from native digits to Western numbers
|
127 |
+
* Faster for South Asian languages
|
128 |
+
|
129 |
+
Other features
|
130 |
+
* Web interface: http://www.isi.edu/~ulf/uroman.html
|
131 |
+
* Vowelization is provided when locally computable, e.g. for many South Asian
|
132 |
+
languages and Tibetan.
|
133 |
+
|
134 |
+
Limitations
|
135 |
+
* This version of uroman assumes all CJK ideographs to be Mandarin (Chinese).
|
136 |
+
This means that Japanese kanji are incorrectly romanized; however, Japanese
|
137 |
+
hiragana and katakana are properly romanized.
|
138 |
+
* A romanizer is not a full transliterator. For example, this version of
|
139 |
+
uroman does not vowelize text that lacks explicit vowelization such as
|
140 |
+
normal text in Arabic and Hebrew (without diacritics/points).
|
141 |
+
|
uroman/bin/de-accent.pl
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/perl -w
|
2 |
+
|
3 |
+
sub print_version {
|
4 |
+
print STDERR "$0 version 1.1\n";
|
5 |
+
print STDERR " Author: Ulf Hermjakob\n";
|
6 |
+
print STDERR " Last changed: March 14, 2011\n";
|
7 |
+
}
|
8 |
+
|
9 |
+
sub print_usage {
|
10 |
+
print STDERR "$0 [options] < with_accents.txt > without_accents.txt\n";
|
11 |
+
print STDERR " -h or -help\n";
|
12 |
+
print STDERR " -v or -version\n";
|
13 |
+
}
|
14 |
+
|
15 |
+
sub de_accent_string {
|
16 |
+
local($s) = @_;
|
17 |
+
|
18 |
+
# $s =~ tr/A-Z/a-z/;
|
19 |
+
unless (0) {
|
20 |
+
# Latin-1
|
21 |
+
if ($s =~ /\xC3[\x80-\xBF]/) {
|
22 |
+
$s =~ s/(À|Á|Â|Ã|Ä|Å)/A/g;
|
23 |
+
$s =~ s/Æ/Ae/g;
|
24 |
+
$s =~ s/Ç/C/g;
|
25 |
+
$s =~ s/Ð/D/g;
|
26 |
+
$s =~ s/(È|É|Ê|Ë)/E/g;
|
27 |
+
$s =~ s/(Ì|Í|Î|Ï)/I/g;
|
28 |
+
$s =~ s/Ñ/N/g;
|
29 |
+
$s =~ s/(Ò|Ó|Ô|Õ|Ö|Ø)/O/g;
|
30 |
+
$s =~ s/(Ù|Ú|Û|Ü)/U/g;
|
31 |
+
$s =~ s/Þ/Th/g;
|
32 |
+
$s =~ s/Ý/Y/g;
|
33 |
+
$s =~ s/(à|á|â|ã|ä|å)/a/g;
|
34 |
+
$s =~ s/æ/ae/g;
|
35 |
+
$s =~ s/ç/c/g;
|
36 |
+
$s =~ s/(è|é|ê|ë)/e/g;
|
37 |
+
$s =~ s/(ì|í|î|ï)/i/g;
|
38 |
+
$s =~ s/ð/d/g;
|
39 |
+
$s =~ s/ñ/n/g;
|
40 |
+
$s =~ s/(ò|ó|ô|õ|ö)/o/g;
|
41 |
+
$s =~ s/ß/ss/g;
|
42 |
+
$s =~ s/þ/th/g;
|
43 |
+
$s =~ s/(ù|ú|û|ü)/u/g;
|
44 |
+
$s =~ s/(ý|ÿ)/y/g;
|
45 |
+
}
|
46 |
+
# Latin Extended-A
|
47 |
+
if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) {
|
48 |
+
$s =~ s/(Ā|Ă|Ą)/A/g;
|
49 |
+
$s =~ s/(ā|ă|ą)/a/g;
|
50 |
+
$s =~ s/(Ć|Ĉ|Ċ|Č)/C/g;
|
51 |
+
$s =~ s/(ć|ĉ|ċ|č)/c/g;
|
52 |
+
$s =~ s/(Ď|Đ)/D/g;
|
53 |
+
$s =~ s/(ď|đ)/d/g;
|
54 |
+
$s =~ s/(Ē|Ĕ|Ė|Ę|Ě)/E/g;
|
55 |
+
$s =~ s/(ē|ĕ|ė|ę|ě)/e/g;
|
56 |
+
$s =~ s/(Ĝ|Ğ|Ġ|Ģ)/G/g;
|
57 |
+
$s =~ s/(ĝ|ğ|ġ|ģ)/g/g;
|
58 |
+
$s =~ s/(Ĥ|Ħ)/H/g;
|
59 |
+
$s =~ s/(ĥ|ħ)/h/g;
|
60 |
+
$s =~ s/(Ĩ|Ī|Ĭ|Į|İ)/I/g;
|
61 |
+
$s =~ s/(ĩ|ī|ĭ|į|ı)/i/g;
|
62 |
+
# U+0132 (Latin capital ligature IJ), matched by its UTF-8 bytes so that plain ASCII "IJ" is left untouched.
$s =~ s/\xC4\xB2/Ij/g;
|
63 |
+
# U+0133 (Latin small ligature ij), matched by its UTF-8 bytes; an ASCII /ij/ pattern would be a no-op.
$s =~ s/\xC4\xB3/ij/g;
|
64 |
+
$s =~ s/Ĵ/J/g;
|
65 |
+
$s =~ s/ĵ/j/g;
|
66 |
+
$s =~ s/Ķ/K/g;
|
67 |
+
$s =~ s/(ķ|ĸ)/k/g;
|
68 |
+
$s =~ s/(Ĺ|Ļ|Ľ|Ŀ|Ł)/L/g;
|
69 |
+
$s =~ s/(ļ|ľ|ŀ|ł)/l/g;
|
70 |
+
$s =~ s/(Ń|Ņ|Ň|Ŋ)/N/g;
|
71 |
+
# \xC5\x89 is U+0149 (n preceded by apostrophe), written as UTF-8 bytes to keep it a single code point.
$s =~ s/(ń|ņ|ň|\xC5\x89|ŋ)/n/g;
|
72 |
+
$s =~ s/(Ō|Ŏ|Ő)/O/g;
|
73 |
+
$s =~ s/(ō|ŏ|ő)/o/g;
|
74 |
+
$s =~ s/Œ/Oe/g;
|
75 |
+
$s =~ s/œ/oe/g;
|
76 |
+
$s =~ s/(Ŕ|Ŗ|Ř)/R/g;
|
77 |
+
$s =~ s/(ŕ|ŗ|ř)/r/g;
|
78 |
+
$s =~ s/(Ś|Ŝ|Ş|Š)/S/g;
|
79 |
+
$s =~ s/(ś|ŝ|ş|š|ſ)/s/g;
|
80 |
+
$s =~ s/(Ţ|Ť|Ŧ)/T/g;
|
81 |
+
$s =~ s/(ţ|ť|ŧ)/t/g;
|
82 |
+
$s =~ s/(Ũ|Ū|Ŭ|Ů|Ű|Ų)/U/g;
|
83 |
+
$s =~ s/(ũ|ū|ŭ|ů|ű|ų)/u/g;
|
84 |
+
$s =~ s/Ŵ/W/g;
|
85 |
+
$s =~ s/ŵ/w/g;
|
86 |
+
$s =~ s/(Ŷ|Ÿ)/Y/g;
|
87 |
+
$s =~ s/ŷ/y/g;
|
88 |
+
$s =~ s/(Ź|Ż|Ž)/Z/g;
|
89 |
+
$s =~ s/(ź|ż|ž)/z/g;
|
90 |
+
}
|
91 |
+
# Latin Extended Additional
|
92 |
+
if ($s =~ /\xE1[\xB8-\xBF][\x80-\xBF]/) {
|
93 |
+
$s =~ s/(ḁ|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẚ)/a/g;
|
94 |
+
$s =~ s/(ḃ|ḅ|ḇ)/b/g;
|
95 |
+
$s =~ s/(ḉ)/c/g;
|
96 |
+
$s =~ s/(ḋ|ḍ|ḏ|ḑ|ḓ)/d/g;
|
97 |
+
$s =~ s/(ḕ|ḗ|ḙ|ḛ|ḝ|ẹ|ẻ|ẽ|ế|ề|ể|ễ|ệ)/e/g;
|
98 |
+
$s =~ s/(ḟ)/f/g;
|
99 |
+
$s =~ s/(ḡ)/g/g;
|
100 |
+
$s =~ s/(ḣ|ḥ|ḧ|ḩ|ḫ)/h/g;
|
101 |
+
$s =~ s/(ḭ|ḯ|ỉ|ị)/i/g;
|
102 |
+
$s =~ s/(ḱ|ḳ|ḵ)/k/g;
|
103 |
+
$s =~ s/(ḷ|ḹ|ḻ|ḽ)/l/g;
|
104 |
+
$s =~ s/(ḿ|ṁ|ṃ)/m/g;
|
105 |
+
# Accented forms of "n" fold to "n" (not "m"), mirroring the uppercase rule that maps Ṅ|Ṇ|Ṉ|Ṋ to "N".
$s =~ s/(ṅ|ṇ|ṉ|ṋ)/n/g;
|
106 |
+
$s =~ s/(ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ṍ|ṏ|ṑ|ṓ)/o/g;
|
107 |
+
$s =~ s/(ṕ|ṗ)/p/g;
|
108 |
+
$s =~ s/(ṙ|ṛ|ṝ|ṟ)/r/g;
|
109 |
+
$s =~ s/(ṡ|ṣ|ṥ|ṧ|ṩ|ẛ)/s/g;
|
110 |
+
$s =~ s/(ṫ|ṭ|ṯ|ṱ)/t/g;
|
111 |
+
$s =~ s/(ṳ|ṵ|ṷ|ṹ|ṻ|ụ|ủ|ứ|ừ|ử|ữ|ự)/u/g;
|
112 |
+
$s =~ s/(ṽ|ṿ)/v/g;
|
113 |
+
$s =~ s/(ẁ|ẃ|ẅ|ẇ|ẉ|ẘ)/w/g;
|
114 |
+
$s =~ s/(ẋ|ẍ)/x/g;
|
115 |
+
$s =~ s/(ẏ|ỳ|ỵ|ỷ|ỹ|ẙ)/y/g;
|
116 |
+
$s =~ s/(ẑ|ẓ|ẕ)/z/g;
|
117 |
+
$s =~ s/(Ḁ|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ)/A/g;
|
118 |
+
$s =~ s/(Ḃ|Ḅ|Ḇ)/B/g;
|
119 |
+
$s =~ s/(Ḉ)/C/g;
|
120 |
+
$s =~ s/(Ḋ|Ḍ|Ḏ|Ḑ|Ḓ)/D/g;
|
121 |
+
$s =~ s/(Ḕ|Ḗ|Ḙ|Ḛ|Ḝ|Ẹ|Ẻ|Ẽ|Ế|Ề|Ể|Ễ|Ệ)/E/g;
|
122 |
+
$s =~ s/(Ḟ)/F/g;
|
123 |
+
$s =~ s/(Ḡ)/G/g;
|
124 |
+
$s =~ s/(Ḣ|Ḥ|Ḧ|Ḩ|Ḫ)/H/g;
|
125 |
+
$s =~ s/(Ḭ|Ḯ|Ỉ|Ị)/I/g;
|
126 |
+
$s =~ s/(Ḱ|Ḳ|Ḵ)/K/g;
|
127 |
+
$s =~ s/(Ḷ|Ḹ|Ḻ|Ḽ)/L/g;
|
128 |
+
$s =~ s/(Ḿ|Ṁ|Ṃ)/M/g;
|
129 |
+
$s =~ s/(Ṅ|Ṇ|Ṉ|Ṋ)/N/g;
|
130 |
+
$s =~ s/(Ṍ|Ṏ|Ṑ|Ṓ|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ)/O/g;
|
131 |
+
$s =~ s/(Ṕ|Ṗ)/P/g;
|
132 |
+
$s =~ s/(Ṙ|Ṛ|Ṝ|Ṟ)/R/g;
|
133 |
+
$s =~ s/(Ṡ|Ṣ|Ṥ|Ṧ|Ṩ)/S/g;
|
134 |
+
$s =~ s/(Ṫ|Ṭ|Ṯ|Ṱ)/T/g;
|
135 |
+
$s =~ s/(Ṳ|Ṵ|Ṷ|Ṹ|Ṻ|Ụ|Ủ|Ứ|Ừ|Ử|Ữ|Ự)/U/g;
|
136 |
+
$s =~ s/(Ṽ|Ṿ)/V/g;
|
137 |
+
$s =~ s/(Ẁ|Ẃ|Ẅ|Ẇ|Ẉ)/W/g;
|
138 |
+
$s =~ s/(Ẍ)/X/g;
|
139 |
+
$s =~ s/(Ẏ|Ỳ|Ỵ|Ỷ|Ỹ)/Y/g;
|
140 |
+
$s =~ s/(Ẑ|Ẓ|Ẕ)/Z/g;
|
141 |
+
}
|
142 |
+
# Greek letters
|
143 |
+
# Guard must cover every accented Greek code point handled in this branch:
# U+0386..U+03B0 (bytes CE 86..CE B0) and U+03CA..U+03CE (bytes CF 8A..CF 8E).
# A range ending at CE AB skips input whose only accented letters are lowercase.
if ($s =~ /\xCE[\x86-\xB0]|\xCF[\x8A-\x8E]/) {
|
144 |
+
$s =~ s/ά/α/g;
|
145 |
+
$s =~ s/έ/ε/g;
|
146 |
+
$s =~ s/ί/ι/g;
|
147 |
+
$s =~ s/ϊ/ι/g;
|
148 |
+
$s =~ s/ΐ/ι/g;
|
149 |
+
$s =~ s/ό/ο/g;
|
150 |
+
$s =~ s/ύ/υ/g;
|
151 |
+
$s =~ s/ϋ/υ/g;
|
152 |
+
$s =~ s/ΰ/υ/g;
|
153 |
+
$s =~ s/ώ/ω/g;
|
154 |
+
$s =~ s/Ά/Α/g;
|
155 |
+
$s =~ s/Έ/Ε/g;
|
156 |
+
$s =~ s/Ή/Η/g;
|
157 |
+
$s =~ s/Ί/Ι/g;
|
158 |
+
$s =~ s/Ϊ/Ι/g;
|
159 |
+
$s =~ s/Ύ/Υ/g;
|
160 |
+
$s =~ s/Ϋ/Υ/g;
|
161 |
+
$s =~ s/Ώ/Ω/g;
|
162 |
+
}
|
163 |
+
# Cyrillic letters
|
164 |
+
# Guard must cover every Cyrillic code point handled in this branch:
# capitals and U+0439 (й) encode as D0 80..D0 BF; U+0450..U+045D (ѐ ё ѓ ќ ѝ) encode as D1 90..D1 9D.
# A range ending at D0 AF skips input whose only accented letters are lowercase.
if ($s =~ /\xD0[\x80-\xBF]|\xD1[\x90-\x9F]/) {
|
165 |
+
$s =~ s/Ѐ/Е/g;
|
166 |
+
$s =~ s/Ё/Е/g;
|
167 |
+
$s =~ s/Ѓ/Г/g;
|
168 |
+
$s =~ s/Ќ/К/g;
|
169 |
+
$s =~ s/Ѝ/И/g;
|
170 |
+
$s =~ s/Й/И/g;
|
171 |
+
$s =~ s/ѐ/е/g;
|
172 |
+
$s =~ s/ё/е/g;
|
173 |
+
$s =~ s/ѓ/г/g;
|
174 |
+
$s =~ s/ќ/к/g;
|
175 |
+
$s =~ s/ѝ/и/g;
|
176 |
+
$s =~ s/й/и/g;
|
177 |
+
}
|
178 |
+
}
|
179 |
+
return $s;
|
180 |
+
}
|
181 |
+
|
182 |
+
while (@ARGV) {
|
183 |
+
$arg = shift @ARGV;
|
184 |
+
if ($arg =~ /^-*(h|help)$/i) {
|
185 |
+
&print_usage;
|
186 |
+
exit 1;
|
187 |
+
} elsif ($arg =~ /^-*(v|version)$/i) {
|
188 |
+
&print_version;
|
189 |
+
exit 1;
|
190 |
+
} else {
|
191 |
+
print STDERR "Ignoring unrecognized argument $arg\n";
|
192 |
+
}
|
193 |
+
}
|
194 |
+
|
195 |
+
$line_number = 0;
|
196 |
+
while (<>) {
|
197 |
+
$line_number++;
|
198 |
+
print &de_accent_string($_);
|
199 |
+
}
|
200 |
+
exit 0;
|
201 |
+
|
uroman/bin/string-distance.pl
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/perl -w
|
2 |
+
|
3 |
+
# Author: Ulf Hermjakob
|
4 |
+
# Release date: October 13, 2019
|
5 |
+
|
6 |
+
# Usage: string-distance.pl {-lc1 <language-code>} {-lc2 <language-code>} < STDIN > STDOUT
|
7 |
+
# Example: string-distance.pl -lc1 rus -lc2 ukr < STDIN > STDOUT
|
8 |
+
# Example: string-distance.pl < ../test/string-similarity-test-input.txt
|
9 |
+
# Input format: two strings per line (tab-separated, in Latin script)
|
10 |
+
# Strings in non-Latin scripts should first be romanized. (Recommended script: uroman.pl)
|
11 |
+
# Output format: repetition of the two input strings, plus the string distance between them (tab-separated).
|
12 |
+
# Additional output meta info lines at the top are marked with an initial #.
|
13 |
+
#
|
14 |
+
# The script uses data from a string-distance-cost-rules file that lists costs,
|
15 |
+
# where the default cost is "1" with lower costs for differences in vowels,
|
16 |
+
# duplicate consonants, "f" vs. "ph" etc.
|
17 |
+
# Language cost rules can be language-specific and context-sensitive.
|
18 |
+
|
19 |
+
$|=1;
|
20 |
+
|
21 |
+
use FindBin;
|
22 |
+
use Cwd "abs_path";
|
23 |
+
use File::Basename qw(dirname);
|
24 |
+
use File::Spec;
|
25 |
+
|
26 |
+
my $bin_dir = abs_path(dirname($0));
|
27 |
+
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
|
28 |
+
my $data_dir = File::Spec->catfile($root_dir, "data");
|
29 |
+
my $lib_dir = File::Spec->catfile($root_dir, "lib");
|
30 |
+
|
31 |
+
use lib "$FindBin::Bin/../lib";
|
32 |
+
use List::Util qw(min max);
|
33 |
+
use NLP::utilities;
|
34 |
+
use NLP::stringDistance;
|
35 |
+
$util = NLP::utilities;
|
36 |
+
$sd = NLP::stringDistance;
|
37 |
+
$verbose = 0;
|
38 |
+
$separator = "\t";
|
39 |
+
|
40 |
+
$cost_rule_filename = File::Spec->catfile($data_dir, "string-distance-cost-rules.txt");
|
41 |
+
|
42 |
+
$lang_code1 = "eng";
|
43 |
+
$lang_code2 = "eng";
|
44 |
+
%ht = ();
|
45 |
+
|
46 |
+
while (@ARGV) {
|
47 |
+
$arg = shift @ARGV;
|
48 |
+
if ($arg =~ /^-+lc1$/) {
|
49 |
+
$lang_code_candidate = shift @ARGV;
|
50 |
+
$lang_code1 = $lang_code_candidate if $lang_code_candidate =~ /^[a-z]{3,3}$/;
|
51 |
+
} elsif ($arg =~ /^-+lc2$/) {
|
52 |
+
$lang_code_candidate = shift @ARGV;
|
53 |
+
$lang_code2 = $lang_code_candidate if $lang_code_candidate =~ /^[a-z]{3,3}$/;
|
54 |
+
} elsif ($arg =~ /^-+(v|verbose)$/) {
|
55 |
+
$verbose = shift @ARGV;
|
56 |
+
} else {
|
57 |
+
print STDERR "Ignoring unrecognized arg $arg\n";
|
58 |
+
}
|
59 |
+
}
|
60 |
+
|
61 |
+
$sd->load_string_distance_data($cost_rule_filename, *ht, $verbose);
|
62 |
+
print STDERR "Loaded resources.\n" if $verbose;
|
63 |
+
|
64 |
+
my $chart_id = 0;
|
65 |
+
my $line_number = 0;
|
66 |
+
print "# Lang-code-1: $lang_code1 Lang-code-2: $lang_code2\n";
|
67 |
+
while (<>) {
|
68 |
+
$line_number++;
|
69 |
+
if ($verbose) {
|
70 |
+
if ($line_number =~ /000$/) {
|
71 |
+
if ($line_number =~ /0000$/) {
|
72 |
+
print STDERR $line_number;
|
73 |
+
} else {
|
74 |
+
print STDERR ".";
|
75 |
+
}
|
76 |
+
}
|
77 |
+
}
|
78 |
+
my $line = $_;
|
79 |
+
$line =~ s/^\xEF\xBB\xBF//;
|
80 |
+
next if $line =~ /^\s*(\#.*)?$/;
|
81 |
+
my $s1;
|
82 |
+
my $s2;
|
83 |
+
if (($s1, $s2) = ($line =~ /^("(?:\\"|[^"])*"|\S+)$separator("(?:\\"|[^"])*"|\S+)\s*$/)) {
|
84 |
+
$s1 = $util->dequote_string($s1);
|
85 |
+
$s2 = $util->dequote_string($s2);
|
86 |
+
} elsif ($line =~ /^\s*(#.*)$/) {
|
87 |
+
} else {
|
88 |
+
print STDERR "Could not process line $line_number: $line" if $verbose;
|
89 |
+
print "\n";
|
90 |
+
next;
|
91 |
+
}
|
92 |
+
|
93 |
+
$cost = $sd->quick_romanized_string_distance_by_chart($s1, $s2, *ht, "", $lang_code1, $lang_code2);
|
94 |
+
print "$s1\t$s2\t$cost\n";
|
95 |
+
}
|
96 |
+
print STDERR "\n" if $verbose;
|
97 |
+
|
98 |
+
exit 0;
|
99 |
+
|
uroman/bin/uroman-quick.pl
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/perl -w
|
2 |
+
|
3 |
+
# uroman Nov. 12, 2015 - July 25, 2016
|
4 |
+
# version v0.7
|
5 |
+
# Author: Ulf Hermjakob
|
6 |
+
|
7 |
+
# Usage: uroman-quick.pl {-l [tur|uig|ukr|yid]} < STDIN
|
8 |
+
# currently only for Arabic script languages, incl. Uyghur
|
9 |
+
|
10 |
+
$|=1;
|
11 |
+
|
12 |
+
use FindBin;
|
13 |
+
use Cwd "abs_path";
|
14 |
+
use File::Basename qw(dirname);
|
15 |
+
use File::Spec;
|
16 |
+
|
17 |
+
my $bin_dir = abs_path(dirname($0));
|
18 |
+
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
|
19 |
+
my $data_dir = File::Spec->catfile($root_dir, "data");
|
20 |
+
my $lib_dir = File::Spec->catfile($root_dir, "lib");
|
21 |
+
|
22 |
+
use lib "$FindBin::Bin/../lib";
|
23 |
+
use NLP::Romanizer;
|
24 |
+
use NLP::UTF8;
|
25 |
+
$romanizer = NLP::Romanizer;
|
26 |
+
%ht = ();
|
27 |
+
$lang_code = "";
|
28 |
+
|
29 |
+
while (@ARGV) {
|
30 |
+
$arg = shift @ARGV;
|
31 |
+
if ($arg =~ /^-+(l|lc|lang-code)$/) {
|
32 |
+
$lang_code = lc (shift @ARGV || "")
|
33 |
+
} else {
|
34 |
+
print STDERR "Ignoring unrecognized arg $arg\n";
|
35 |
+
}
|
36 |
+
}
|
37 |
+
|
38 |
+
$romanization_table_arabic_block_filename = File::Spec->catfile($data_dir, "romanization-table-arabic-block.txt");
|
39 |
+
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");
|
40 |
+
|
41 |
+
$romanizer->load_romanization_table(*ht, $romanization_table_arabic_block_filename);
|
42 |
+
$romanizer->load_romanization_table(*ht, $romanization_table_filename);
|
43 |
+
|
44 |
+
$line_number = 0;
|
45 |
+
while (<>) {
|
46 |
+
$line_number++;
|
47 |
+
my $line = $_;
|
48 |
+
print $romanizer->quick_romanize($line, $lang_code, *ht) . "\n";
|
49 |
+
if ($line_number =~ /0000$/) {
|
50 |
+
print STDERR $line_number;
|
51 |
+
} elsif ($line_number =~ /000$/) {
|
52 |
+
print STDERR ".";
|
53 |
+
}
|
54 |
+
}
|
55 |
+
print STDERR "\n";
|
56 |
+
|
57 |
+
exit 0;
|
58 |
+
|
uroman/bin/uroman-tsv.sh
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash
|
2 |
+
# Created by Thamme Gowda on June 17, 2019
|
3 |
+
|
4 |
+
DIR=$(dirname "${BASH_SOURCE[0]}") # get the directory name
|
5 |
+
# DIR=$(realpath "${DIR}") # resolve its full path if need be
|
6 |
+
|
7 |
+
if [[ $# -lt 1 || $# -gt 2 ]]; then
|
8 |
+
>&2 echo "ERROR: invalid args"
|
9 |
+
>&2 echo "Usage: <input.tsv> [<output.tsv>]"
|
10 |
+
exit 2
|
11 |
+
fi
|
12 |
+
|
13 |
+
INP=$1
|
14 |
+
OUT=$2
|
15 |
+
|
16 |
+
CMD=$DIR/uroman.pl
|
17 |
+
|
18 |
+
function romanize(){
|
19 |
+
# Column 1 is passed through unchanged; column 2 is romanized. Quote the paths so that
# file names containing spaces or glob characters are not word-split or expanded.
paste <(cut -f1 "$INP") <(cut -f2 "$INP" | "$CMD")
|
20 |
+
}
|
21 |
+
|
22 |
+
if [[ -n $OUT ]]; then
|
23 |
+
# Quote the redirect target so an output path with spaces is not an "ambiguous redirect".
romanize > "$OUT"
|
24 |
+
else
|
25 |
+
romanize
|
26 |
+
fi
|
27 |
+
|
28 |
+
|
uroman/bin/uroman.pl
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/perl -w
|
2 |
+
|
3 |
+
# uroman Nov. 12, 2015 - Apr. 23, 2021
|
4 |
+
$version = "v1.2.8";
|
5 |
+
# Author: Ulf Hermjakob
|
6 |
+
|
7 |
+
# Usage: uroman.pl {-l [ara|bel|bul|deu|ell|eng|fas|grc|heb|kaz|kir|lav|lit|mkd|mkd2|oss|pnt|rus|srp|srp2|tur|uig|ukr|yid]} {--chart|--offset-mapping} {--no-cache} {--workset} < STDIN
|
8 |
+
# Example: cat workset.txt | uroman.pl --offset-mapping --workset
|
9 |
+
|
10 |
+
$|=1;
|
11 |
+
|
12 |
+
use FindBin;
|
13 |
+
use Cwd "abs_path";
|
14 |
+
use File::Basename qw(dirname);
|
15 |
+
use File::Spec;
|
16 |
+
|
17 |
+
my $bin_dir = abs_path(dirname($0));
|
18 |
+
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
|
19 |
+
my $data_dir = File::Spec->catfile($root_dir, "data");
|
20 |
+
my $lib_dir = File::Spec->catfile($root_dir, "lib");
|
21 |
+
|
22 |
+
use lib "$FindBin::Bin/../lib";
|
23 |
+
use NLP::Chinese;
|
24 |
+
use NLP::Romanizer;
|
25 |
+
use NLP::UTF8;
|
26 |
+
use NLP::utilities;
|
27 |
+
use JSON;
|
28 |
+
$chinesePM = NLP::Chinese;
|
29 |
+
$romanizer = NLP::Romanizer;
|
30 |
+
$util = NLP::utilities;
|
31 |
+
%ht = ();
|
32 |
+
%pinyin_ht = ();
|
33 |
+
$lang_code = "";
|
34 |
+
$return_chart_p = 0;
|
35 |
+
$return_offset_mappings_p = 0;
|
36 |
+
$workset_p = 0;
|
37 |
+
$cache_rom_tokens_p = 1;
|
38 |
+
|
39 |
+
$script_data_filename = File::Spec->catfile($data_dir, "Scripts.txt");
|
40 |
+
$unicode_data_overwrite_filename = File::Spec->catfile($data_dir, "UnicodeDataOverwrite.txt");
|
41 |
+
$unicode_data_filename = File::Spec->catfile($data_dir, "UnicodeData.txt");
|
42 |
+
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");
|
43 |
+
$chinese_tonal_pinyin_filename = File::Spec->catfile($data_dir, "Chinese_to_Pinyin.txt");
|
44 |
+
|
45 |
+
while (@ARGV) {
|
46 |
+
$arg = shift @ARGV;
|
47 |
+
if ($arg =~ /^-+(l|lc|lang-code)$/) {
|
48 |
+
$lang_code = lc (shift @ARGV || "")
|
49 |
+
} elsif ($arg =~ /^-+chart$/i) {
|
50 |
+
$return_chart_p = 1;
|
51 |
+
} elsif ($arg =~ /^-+workset$/i) {
|
52 |
+
$workset_p = 1;
|
53 |
+
} elsif ($arg =~ /^-+offset[-_]*map/i) {
|
54 |
+
$return_offset_mappings_p = 1;
|
55 |
+
} elsif ($arg =~ /^-+unicode[-_]?data/i) {
|
56 |
+
$filename = shift @ARGV;
|
57 |
+
if (-r $filename) {
|
58 |
+
$unicode_data_filename = $filename;
|
59 |
+
} else {
|
60 |
+
print STDERR "Ignoring invalid UnicodeData filename $filename\n";
|
61 |
+
}
|
62 |
+
} elsif ($arg =~ /^-+(no-tok-cach|no-cach)/i) {
|
63 |
+
$cache_rom_tokens_p = 0;
|
64 |
+
} else {
|
65 |
+
print STDERR "Ignoring unrecognized arg $arg\n";
|
66 |
+
}
|
67 |
+
}
|
68 |
+
|
69 |
+
$romanizer->load_script_data(*ht, $script_data_filename);
|
70 |
+
$romanizer->load_unicode_data(*ht, $unicode_data_filename);
|
71 |
+
$romanizer->load_unicode_overwrite_romanization(*ht, $unicode_data_overwrite_filename);
|
72 |
+
$romanizer->load_romanization_table(*ht, $romanization_table_filename);
|
73 |
+
$chinese_to_pinyin_not_yet_loaded_p = 1;
|
74 |
+
$current_date = $util->datetime("dateTtime");
|
75 |
+
$lang_code_clause = ($lang_code) ? " \"lang-code\":\"$lang_code\",\n" : "";
|
76 |
+
|
77 |
+
print "{\n \"romanizer\":\"uroman $version (Ulf Hermjakob, USC/ISI)\",\n \"date\":\"$current_date\",\n$lang_code_clause \"romanization\": [\n" if $return_chart_p;
|
78 |
+
my $line_number = 0;
|
79 |
+
my $chart_result = "";
|
80 |
+
while (<>) {
|
81 |
+
$line_number++;
|
82 |
+
my $line = $_;
|
83 |
+
my $snt_id = "";
|
84 |
+
if ($workset_p) {
|
85 |
+
next if $line =~ /^#/;
|
86 |
+
if (($i_value, $s_value) = ($line =~ /^(\S+\.\d+)\s(.*)$/)) {
|
87 |
+
$snt_id = $i_value;
|
88 |
+
$line = "$s_value\n";
|
89 |
+
} else {
|
90 |
+
next;
|
91 |
+
}
|
92 |
+
}
|
93 |
+
if ($chinese_to_pinyin_not_yet_loaded_p && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($line)) {
|
94 |
+
$chinesePM->read_chinese_tonal_pinyin_files(*pinyin_ht, $chinese_tonal_pinyin_filename);
|
95 |
+
$chinese_to_pinyin_not_yet_loaded_p = 0;
|
96 |
+
}
|
97 |
+
if ($return_chart_p) {
|
98 |
+
print $chart_result;
|
99 |
+
*chart_ht = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return chart", $line_number);
|
100 |
+
$chart_result = $romanizer->chart_to_json_romanization_elements(0, $chart_ht{N_CHARS}, *chart_ht, $line_number);
|
101 |
+
} elsif ($return_offset_mappings_p) {
|
102 |
+
($best_romanization, $offset_mappings) = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return offset mappings", $line_number, 0);
|
103 |
+
print "::snt-id $snt_id\n" if $workset_p;
|
104 |
+
print "::orig $line";
|
105 |
+
print "::rom $best_romanization\n";
|
106 |
+
print "::align $offset_mappings\n\n";
|
107 |
+
} elsif ($cache_rom_tokens_p) {
|
108 |
+
print $romanizer->romanize_by_token_with_caching($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
|
109 |
+
} else {
|
110 |
+
print $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
|
111 |
+
}
|
112 |
+
}
|
113 |
+
$chart_result =~ s/,(\s*)$/$1/;
|
114 |
+
print $chart_result;
|
115 |
+
print " ]\n}\n" if $return_chart_p;
|
116 |
+
|
117 |
+
$dev_test_p = 0;
|
118 |
+
if ($dev_test_p) {
|
119 |
+
$n_suspicious_code_points = 0;
|
120 |
+
$n_instances = 0;
|
121 |
+
foreach $char_name (sort { hex($ht{UTF_NAME_TO_UNICODE}->{$a}) <=> hex($ht{UTF_NAME_TO_UNICODE}->{$b}) }
|
122 |
+
keys %{$ht{SUSPICIOUS_ROMANIZATION}}) {
|
123 |
+
$unicode_value = $ht{UTF_NAME_TO_UNICODE}->{$char_name};
|
124 |
+
$utf8_string = $ht{UTF_NAME_TO_CODE}->{$char_name};
|
125 |
+
foreach $romanization (sort keys %{$ht{SUSPICIOUS_ROMANIZATION}->{$char_name}}) {
|
126 |
+
$count = $ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization};
|
127 |
+
$s = ($count == 1) ? "" : "s";
|
128 |
+
print STDERR "*** Suspiciously lengthy romanization:\n" unless $n_suspicious_code_points;
|
129 |
+
print STDERR "::s $utf8_string ::t $romanization ::comment $char_name (U+$unicode_value)\n";
|
130 |
+
$n_suspicious_code_points++;
|
131 |
+
$n_instances += $count;
|
132 |
+
}
|
133 |
+
}
|
134 |
+
print STDERR " *** Total of $n_suspicious_code_points suspicious code points ($n_instances instance$s)\n" if $n_suspicious_code_points;
|
135 |
+
}
|
136 |
+
|
137 |
+
exit 0;
|
138 |
+
|
uroman/data/Chinese_to_Pinyin.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
uroman/data/Scripts.txt
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::script-name Aegean
|
2 |
+
::script-name Ahom
|
3 |
+
::script-name Anatolian Hieroglyph
|
4 |
+
::script-name Arabic ::direction right-to-left
|
5 |
+
::script-name Armenian
|
6 |
+
::script-name Avestan
|
7 |
+
::script-name Balinese
|
8 |
+
::script-name Bamum
|
9 |
+
::script-name Bassa Vah
|
10 |
+
::script-name Batak
|
11 |
+
::script-name Bengali ::abugida-default-vowel a
|
12 |
+
::script-name Bhaiksuki
|
13 |
+
::script-name Bopomofo ::language Chinese
|
14 |
+
::script-name Brahmi ::abugida-default-vowel a
|
15 |
+
::script-name Braille
|
16 |
+
::script-name Buginese
|
17 |
+
::script-name Buhid
|
18 |
+
::script-name Canadian Syllabics
|
19 |
+
::script-name Carian
|
20 |
+
::script-name Caucasian Albanian
|
21 |
+
::script-name Chakma
|
22 |
+
::script-name Cham
|
23 |
+
::script-name Cherokee
|
24 |
+
::script-name Coptic
|
25 |
+
::script-name Cuneiform
|
26 |
+
::script-name Cypriot
|
27 |
+
::script-name Cyrillic
|
28 |
+
::script-name CJK ::alt-script-name Chinese, Kanji ::language Chinese, Japanese, Korean, Mandarin
|
29 |
+
::script-name Deseret
|
30 |
+
::script-name Devanagari ::abugida-default-vowel a
|
31 |
+
::script-name Duployan
|
32 |
+
::script-name Egyptian Hieroglyph
|
33 |
+
::script-name Elbasan
|
34 |
+
::script-name Ethiopic
|
35 |
+
::script-name Georgian
|
36 |
+
::script-name Glagolitic
|
37 |
+
::script-name Gothic
|
38 |
+
::script-name Grantha
|
39 |
+
::script-name Greek
|
40 |
+
::script-name Gujarati ::abugida-default-vowel a
|
41 |
+
::script-name Gurmukhi ::abugida-default-vowel a
|
42 |
+
::script-name Hangul ::language Korean
|
43 |
+
::script-name Hanunoo
|
44 |
+
::script-name Hatran
|
45 |
+
::script-name Hebrew ::direction right-to-left
|
46 |
+
::script-name Hiragana ::language Japanese
|
47 |
+
::script-name Imperial Aramaic
|
48 |
+
::script-name Inscriptional Pahlavi
|
49 |
+
::script-name Inscriptional Parthian
|
50 |
+
::script-name Javanese
|
51 |
+
::script-name Kaithi
|
52 |
+
::script-name Kannada ::abugida-default-vowel a
|
53 |
+
::script-name Katakana ::language Japanese
|
54 |
+
::script-name Kayah Li
|
55 |
+
::script-name Kharoshthi
|
56 |
+
::script-name Khmer ::abugida-default-vowel a, o
|
57 |
+
::script-name Khojki
|
58 |
+
::script-name Khudawadi
|
59 |
+
::script-name Klingon
|
60 |
+
::script-name Lao
|
61 |
+
::script-name Lepcha
|
62 |
+
::script-name Latin
|
63 |
+
::script-name Limbu
|
64 |
+
::script-name Linear A
|
65 |
+
::script-name Linear B
|
66 |
+
::script-name Lycian
|
67 |
+
::script-name Lydian
|
68 |
+
::script-name Mahajani
|
69 |
+
::script-name Malayalam ::abugida-default-vowel a
|
70 |
+
::script-name Mandaic
|
71 |
+
::script-name Manichaean
|
72 |
+
::script-name Marchen
|
73 |
+
::script-name Meetei Mayek
|
74 |
+
::script-name Meroitic Cursive
|
75 |
+
::script-name Meroitic Hieroglyphic
|
76 |
+
::script-name Miao
|
77 |
+
::script-name Modi ::abugida-default-vowel a
|
78 |
+
::script-name Mongolian
|
79 |
+
::script-name Mro
|
80 |
+
::script-name Multani
|
81 |
+
::script-name Myanmar ::alt-script-name Burmese ::abugida-default-vowel a
|
82 |
+
::script-name Nabataean
|
83 |
+
::script-name New Tai Lue
|
84 |
+
::script-name Newa
|
85 |
+
::script-name Nko ::direction right-to-left
|
86 |
+
::script-name Ogham
|
87 |
+
::script-name Ol Chiki
|
88 |
+
::script-name Old Hungarian
|
89 |
+
::script-name Old Italic
|
90 |
+
::script-name Old Permic
|
91 |
+
::script-name Old Persian
|
92 |
+
::script-name Old North Arabian
|
93 |
+
::script-name Old South Arabian
|
94 |
+
::script-name Old Turkic
|
95 |
+
::script-name Oriya ::alt-script-name Odia ::abugida-default-vowel a
|
96 |
+
::script-name Osage
|
97 |
+
::script-name Osmanya
|
98 |
+
::script-name Pahawh Hmong
|
99 |
+
::script-name Palmyrene
|
100 |
+
::script-name Pau Cin Hau
|
101 |
+
::script-name Phags-pa
|
102 |
+
::script-name Phaistos Disc
|
103 |
+
::script-name Phoenician
|
104 |
+
::script-name Psalter Pahlavi
|
105 |
+
::script-name Rejang
|
106 |
+
::script-name Runic
|
107 |
+
::script-name Samaritan
|
108 |
+
::script-name Saurashtra
|
109 |
+
::script-name Sharada
|
110 |
+
::script-name Shavian
|
111 |
+
::script-name Siddham
|
112 |
+
::script-name Sinhala ::abugida-default-vowel a
|
113 |
+
::script-name Sora Sompeng
|
114 |
+
::script-name Sundanese ::abugida-default-vowel a
|
115 |
+
::script-name Syloti Nagri
|
116 |
+
::script-name Syriac
|
117 |
+
::script-name Tagalog
|
118 |
+
::script-name Tagbanwa
|
119 |
+
::script-name Tai Le
|
120 |
+
::script-name Tai Tham
|
121 |
+
::script-name Tai Viet
|
122 |
+
::script-name Takri
|
123 |
+
::script-name Tamil ::abugida-default-vowel a
|
124 |
+
::script-name Tangut
|
125 |
+
::script-name Telugu ::abugida-default-vowel a
|
126 |
+
::script-name Thaana ::direction right-to-left
|
127 |
+
::script-name Thai
|
128 |
+
::script-name Tibetan ::abugida-default-vowel a
|
129 |
+
::script-name Tifinagh
|
130 |
+
::script-name Tirhuta
|
131 |
+
::script-name Ugaritic
|
132 |
+
::script-name Vai
|
133 |
+
::script-name Vedic
|
134 |
+
::script-name Warang Citi
|
135 |
+
::script-name Yi
|
uroman/data/UnicodeData.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
uroman/data/UnicodeDataOverwrite.txt
ADDED
@@ -0,0 +1,442 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## UnicodeDataOverwrite.txt
|
2 |
+
::u 00A0 ::r " " ::comment no-break space
|
3 |
+
::u 01BF ::r w ::comment ƿ Latin Character Wynn (Old English)
|
4 |
+
::u 0294 ::r ' ::comment glottal stop
|
5 |
+
::u 0295 ::r ' ::comment ʕ voiced pharyngeal fricative
|
6 |
+
::u 0305 ::r "" ::comment ̅ Combining overline
|
7 |
+
::u 0306 ::r "" ::comment ̆ Combining breve
|
8 |
+
::u 0307 ::r "" ::comment ̇ Combining dot above
|
9 |
+
::u 030A ::r "" ::comment ̊ Combining ring above
|
10 |
+
::u 030C ::r "" ::comment ̌ Combining caron
|
11 |
+
::u 0311 ::r "" ::comment ̑ Combining inverted breve
|
12 |
+
::u 031D ::r "" ::comment ̝ Combining up tack below
|
13 |
+
::u 031E ::r "" ::comment ̞ Combining down tack below
|
14 |
+
::u 031F ::r "" ::comment ̟ Combining plus sign below
|
15 |
+
::u 0323 ::r "" ::comment ̣ Combining dot below
|
16 |
+
::u 0325 ::r "" ::comment ̥ Combining ring below
|
17 |
+
::u 0329 ::r "" ::comment ̩ Combining vertical line below
|
18 |
+
::u 032A ::r "" ::comment ̪ Combining bridge below
|
19 |
+
::u 032F ::r "" ::comment ̯ Combining inverted breve below
|
20 |
+
::u 0342 ::r "" ::comment ͂ Combining Greek perispomeni (circumflex accent)
|
21 |
+
::u 0343 ::r "" ::comment ̓ Combining Greek koronis
|
22 |
+
::u 0361 ::r "" ::comment Combining double inverted breve
|
23 |
+
::u 0384 ::r "" ::comment ΄ Greek tonos
|
24 |
+
::u 0482 ::r 1000· ::comment ҂ Cyrillic thousands sign
|
25 |
+
::u 0483 ::r "" ::comment ҃ Combining Cyrillic Titlo ::annotation titlo
|
26 |
+
::u 0484 ::r "" ::comment ҄ Combining Cyrillic Palatalization ::annotation palatalization
|
27 |
+
::u 055B ::r "" ::comment ՛ Armenian emphasis mark
|
28 |
+
::u 055F ::r "" ::comment ՟ Armenian abbreviation mark ::annotation abbreviation
|
29 |
+
|
30 |
+
::u 0901 ::r +m ::comment Devanagari sign candrabindu
|
31 |
+
::u 0902 ::r +m ::comment Devanagari sign anusvara
|
32 |
+
::u 0903 ::r +h ::comment Devanagari sign visarga
|
33 |
+
::u 093D ::r ' ::comment Devanagari sign avagraha
|
34 |
+
::u 0950 ::r om ::comment ॐ Devanagari om symbol
|
35 |
+
::u 0951 ::r "" ::comment ॑ Devanagari stress sign "udatta"
|
36 |
+
::u 0952 ::r "" ::comment ॒ Devanagari stress sign "anudatta"
|
37 |
+
::u 0981 ::r +n ::comment Bengali sign candrabindu ("chôndrôbindu")
|
38 |
+
::u 0982 ::r +ng ::comment Bengali sign anusvara ("ônushar")
|
39 |
+
::u 0983 ::r +h ::comment Bengali sign visarga ("bishôrgô")
|
40 |
+
::u 099A ::r ch ::comment instead of Bengali C(A)
|
41 |
+
::u 099B ::r chh ::comment instead of Bengali CC(A)
|
42 |
+
::u 0A02 ::r +m ::comment Gurmukhi sign bindi
|
43 |
+
::u 0A70 ::r +m ::comment Gurmukhi tippi
|
44 |
+
# ::u 0A72 ::r "" ::comment Gurmukhi addak
|
45 |
+
::u 0A72 ::r "" ::comment Gurmukhi iri
|
46 |
+
::u 0A73 ::r "" ::comment Gurmukhi ura
|
47 |
+
::u 0B01 ::r +m ::comment Oriya sign candrabindu
|
48 |
+
::u 0B03 ::r +h ::comment Oriya sign visarga
|
49 |
+
::u 0B5F ::r ya ::comment ୟ Oriya letter yya
|
50 |
+
::u 0B82 ::r +m ::comment Tamil sign anusvara (not to be used?)
|
51 |
+
::u 0B83 ::r +h ::comment Tamil sign visarga ("āytam")
|
52 |
+
::u 0B9F ::r t ::comment instead of Tamil TT(A)
|
53 |
+
::u 0BA3 ::r n ::comment instead of Tamil NN(A)
|
54 |
+
::u 0BA9 ::r n ::comment instead of Tamil NNN(A)
|
55 |
+
::u 0BB1 ::r r ::comment instead of Tamil RR(A)
|
56 |
+
::u 0BB3 ::r l ::comment instead of Tamil LL(A)
|
57 |
+
::u 0BB4 ::r l ::comment instead of Tamil LLL(A)
|
58 |
+
::u 0C03 ::r +h ::comment ః Telugu sign visarga
|
59 |
+
::u 0C83 ::r +h ::comment Kannada sign visarga
|
60 |
+
::u 0D02 ::r +m ::comment Malayalam sign anusvara
|
61 |
+
::u 0D03 ::r +h ::comment Malayalam sign visarga
|
62 |
+
::u 0D82 ::r +n ::comment Sinhala sign anusvaraya
|
63 |
+
::u 0DA4 ::r ny ::comment Sinhala ඤ
|
64 |
+
::u 0DA5 ::r gn ::comment Sinhala ඥ
|
65 |
+
::u 0DCA ::r "" ::comment Sinhala sign al-lakuna (virama = no vowel)
|
66 |
+
::u 0DCF ::r aa ::comment Sinhala ා
|
67 |
+
::u 0DD0 ::r ae ::comment Sinhala ැ
|
68 |
+
::u 0DD1 ::r ae ::comment Sinhala ෑ
|
69 |
+
::u 0DD2 ::r i ::comment Sinhala ි
|
70 |
+
::u 0DD3 ::r ii ::comment Sinhala ී
|
71 |
+
::u 0DD4 ::r u ::comment Sinhala ු
|
72 |
+
::u 0DD6 ::r uu ::comment Sinhala ූ
|
73 |
+
::u 0DD8 ::r r ::comment Sinhala ෘ
|
74 |
+
::u 0DD9 ::r e ::comment Sinhala ෙ
|
75 |
+
::u 0DDA ::r ee ::comment Sinhala ේ
|
76 |
+
::u 0DDB ::r ai ::comment Sinhala ෛ
|
77 |
+
::u 0DDC ::r o ::comment Sinhala ො
|
78 |
+
::u 0DDD ::r oo ::comment Sinhala ෝ
|
79 |
+
::u 0DDE ::r au ::comment Sinhala ෞ
|
80 |
+
::u 0DDF ::r aa ::comment Sinhala ා
|
81 |
+
::u 0DF2 ::r rr ::comment Sinhala ෲ
|
82 |
+
|
83 |
+
::u 0E02 ::r k ::comment Thai character KHO KHAI
|
84 |
+
::u 0E03 ::r k ::comment Thai character KHO KHUAT
|
85 |
+
::u 0E04 ::r k ::comment Thai character KHO KHWAI
|
86 |
+
::u 0E05 ::r k ::comment Thai character KHO KHON
|
87 |
+
::u 0E06 ::r k ::comment Thai character KHO RAKHANG
|
88 |
+
::u 0E10 ::r t ::comment Thai character THO THAN
|
89 |
+
::u 0E11 ::r t ::comment Thai character THO NANGMONTHO
|
90 |
+
::u 0E12 ::r t ::comment Thai character THO PHUTHAO
|
91 |
+
::u 0E16 ::r t ::comment Thai character THO THUNG
|
92 |
+
::u 0E17 ::r t ::comment Thai character THO THAHAN
|
93 |
+
::u 0E18 ::r t ::comment Thai character THO THONG
|
94 |
+
::u 0E1C ::r p ::comment Thai character PHO PHUNG
|
95 |
+
::u 0E1E ::r p ::comment Thai character PHO PHAN
|
96 |
+
::u 0E20 ::r p ::comment Thai character PHO SAMPHAO
|
97 |
+
::u 0E2D ::r o ::comment Thai character O ANG
|
98 |
+
::u 0E2F ::r ... ::comment ฯ Thai character PAIYANNOI (ellipsis, abbreviation)
|
99 |
+
::u 0E31 ::r a ::comment Thai character MAI HAN-AKAT
|
100 |
+
::u 0E3A ::r "" ::comment Thai character PHINTHU (Pali virama)
|
101 |
+
::u 0E40 ::r e ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA E
|
102 |
+
::u 0E41 ::r ae ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AE
|
103 |
+
::u 0E42 ::r o ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA O
|
104 |
+
::u 0E43 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMUAN
|
105 |
+
::u 0E44 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMALAI
|
106 |
+
::u 0E45 ::r "" ::comment Thai character LAKKHANGYAO vowel lengthener
|
107 |
+
::u 0E47 ::r o ::comment Thai character MAITAIKHU vowel shortener
|
108 |
+
::u 0E48 ::r "" ::tone-mark non-standard ::comment Thai tone mark MAI EK
|
109 |
+
::u 0E49 ::r "" ::tone-mark standard ::comment Thai tone mark MAI THO
|
110 |
+
::u 0E4A ::r "" ::tone-mark high ::comment Thai tone mark MAI TRI
|
111 |
+
::u 0E4B ::r "" ::tone-mark rising ::comment Thai tone mark MAI CHATTAWA
|
112 |
+
::u 0E4C ::r "" ::comment Thai character THANTHAKHAT cancellation mark (cf. virama)
|
113 |
+
::u 0E4D ::r +m ::comment ํ Thai character NIKHAHIT final nasal (cf. anusvara)
|
114 |
+
::u 0ECC ::r "" ::comment ໌ Lao cancellation mark ::annotation cancellation
|
115 |
+
::u 0F0B ::r · ::comment ་ Tibetan mark intersyllabic tsheg
|
116 |
+
::u 0F0C ::r "" ::comment ༌ Tibetan mark delimiter tsheg bstar
|
117 |
+
::u 0F84 ::r "" ::comment ྄ Tibetan halanta
|
118 |
+
::u 1036 ::r +n ::comment Myanmar sign anusvara ("auk myit")
|
119 |
+
::u 1037 ::r "" ::tone-mark creaky ::comment Myanmar sign dot below
|
120 |
+
::u 1038 ::r "" ::tone-mark high ::comment Myanmar sign visarga
|
121 |
+
|
122 |
+
::u 16A0 ::r f ::comment ᚠ RUNIC LETTER FEHU FEOH FE F
|
123 |
+
::u 16A1 ::r v ::comment ᚡ RUNIC LETTER V
|
124 |
+
::u 16A2 ::r u ::comment ᚢ RUNIC LETTER URUZ UR U
|
125 |
+
::u 16A3 ::r y ::comment ᚣ RUNIC LETTER YR
|
126 |
+
::u 16A4 ::r y ::comment ᚤ RUNIC LETTER Y
|
127 |
+
::u 16A5 ::r w ::comment ᚥ RUNIC LETTER W
|
128 |
+
::u 16A6 ::r th ::comment ᚦ RUNIC LETTER THURISAZ THURS THORN
|
129 |
+
::u 16A7 ::r th ::comment ᚧ RUNIC LETTER ETH
|
130 |
+
::u 16A8 ::r a ::comment ᚨ RUNIC LETTER ANSUZ A
|
131 |
+
::u 16A9 ::r o ::comment ᚩ RUNIC LETTER OS O
|
132 |
+
::u 16AA ::r a ::comment ᚪ RUNIC LETTER AC A
|
133 |
+
::u 16AB ::r ae ::comment ᚫ RUNIC LETTER AESC
|
134 |
+
::u 16AC ::r o ::comment ᚬ RUNIC LETTER LONG-BRANCH-OSS O
|
135 |
+
::u 16AD ::r o ::comment ᚭ RUNIC LETTER SHORT-TWIG-OSS O
|
136 |
+
::u 16AE ::r o ::comment ᚮ RUNIC LETTER O
|
137 |
+
::u 16AF ::r oe ::comment ᚯ RUNIC LETTER OE
|
138 |
+
::u 16B0 ::r on ::comment ᚰ RUNIC LETTER ON
|
139 |
+
::u 16B1 ::r r ::comment ᚱ RUNIC LETTER RAIDO RAD REID R
|
140 |
+
::u 16B2 ::r k ::comment ᚲ RUNIC LETTER KAUNA
|
141 |
+
::u 16B3 ::r c ::comment ᚳ RUNIC LETTER CEN
|
142 |
+
::u 16B4 ::r k ::comment ᚴ RUNIC LETTER KAUN K
|
143 |
+
::u 16B5 ::r g ::comment ᚵ RUNIC LETTER G
|
144 |
+
::u 16B6 ::r ng ::comment ᚶ RUNIC LETTER ENG
|
145 |
+
::u 16B7 ::r g ::comment ᚷ RUNIC LETTER GEBO GYFU G
|
146 |
+
::u 16B8 ::r g ::comment ᚸ RUNIC LETTER GAR
|
147 |
+
::u 16B9 ::r w ::comment ᚹ RUNIC LETTER WUNJO WYNN W
|
148 |
+
::u 16BA ::r h ::comment ᚺ RUNIC LETTER HAGLAZ H
|
149 |
+
::u 16BB ::r h ::comment ᚻ RUNIC LETTER HAEGL H
|
150 |
+
::u 16BC ::r h ::comment ᚼ RUNIC LETTER LONG-BRANCH-HAGALL H
|
151 |
+
::u 16BD ::r h ::comment ᚽ RUNIC LETTER SHORT-TWIG-HAGALL H
|
152 |
+
::u 16BE ::r n ::comment ᚾ RUNIC LETTER NAUDIZ NYD NAUD N
|
153 |
+
::u 16BF ::r n ::comment ᚿ RUNIC LETTER SHORT-TWIG-NAUD N
|
154 |
+
::u 16C0 ::r n ::comment ᛀ RUNIC LETTER DOTTED-N
|
155 |
+
::u 16C1 ::r i ::comment ᛁ RUNIC LETTER ISAZ IS ISS I
|
156 |
+
::u 16C2 ::r e ::comment ᛂ RUNIC LETTER E
|
157 |
+
::u 16C3 ::r j ::comment ᛃ RUNIC LETTER JERAN J
|
158 |
+
::u 16C4 ::r j ::comment ᛄ RUNIC LETTER GER
|
159 |
+
::u 16C5 ::r ae ::comment ᛅ RUNIC LETTER LONG-BRANCH-AR AE
|
160 |
+
::u 16C6 ::r a ::comment ᛆ RUNIC LETTER SHORT-TWIG-AR A
|
161 |
+
::u 16C7 ::r i ::comment ᛇ RUNIC LETTER IWAZ EOH
|
162 |
+
::u 16C8 ::r p ::comment ᛈ RUNIC LETTER PERTHO PEORTH P
|
163 |
+
::u 16C9 ::r z ::comment ᛉ RUNIC LETTER ALGIZ EOLHX
|
164 |
+
::u 16CA ::r s ::comment ᛊ RUNIC LETTER SOWILO S
|
165 |
+
::u 16CB ::r s ::comment ᛋ RUNIC LETTER SIGEL LONG-BRANCH-SOL S
|
166 |
+
::u 16CC ::r s ::comment ᛌ RUNIC LETTER SHORT-TWIG-SOL S
|
167 |
+
::u 16CD ::r c ::comment ᛍ RUNIC LETTER C
|
168 |
+
::u 16CE ::r z ::comment ᛎ RUNIC LETTER Z
|
169 |
+
::u 16CF ::r t ::comment ᛏ RUNIC LETTER TIWAZ TIR TYR T
|
170 |
+
::u 16D0 ::r t ::comment ᛐ RUNIC LETTER SHORT-TWIG-TYR T
|
171 |
+
::u 16D1 ::r d ::comment ᛑ RUNIC LETTER D
|
172 |
+
::u 16D2 ::r b ::comment ᛒ RUNIC LETTER BERKANAN BEORC BJARKAN B
|
173 |
+
::u 16D3 ::r b ::comment ᛓ RUNIC LETTER SHORT-TWIG-BJARKAN B
|
174 |
+
::u 16D4 ::r p ::comment ᛔ RUNIC LETTER DOTTED-P
|
175 |
+
::u 16D5 ::r p ::comment ᛕ RUNIC LETTER OPEN-P
|
176 |
+
::u 16D6 ::r e ::comment ᛖ RUNIC LETTER EHWAZ EH E
|
177 |
+
::u 16D7 ::r m ::comment ᛗ RUNIC LETTER MANNAZ MAN M
|
178 |
+
::u 16D8 ::r m ::comment ᛘ RUNIC LETTER LONG-BRANCH-MADR M
|
179 |
+
::u 16D9 ::r m ::comment ᛙ RUNIC LETTER SHORT-TWIG-MADR M
|
180 |
+
::u 16DA ::r l ::comment ᛚ RUNIC LETTER LAUKAZ LAGU LOGR L
|
181 |
+
::u 16DB ::r l ::comment ᛛ RUNIC LETTER DOTTED-L
|
182 |
+
::u 16DC ::r ng ::comment ᛜ RUNIC LETTER INGWAZ
|
183 |
+
::u 16DD ::r ng ::comment ᛝ RUNIC LETTER ING
|
184 |
+
::u 16DE ::r d ::comment ᛞ RUNIC LETTER DAGAZ DAEG D
|
185 |
+
::u 16DF ::r o ::comment ᛟ RUNIC LETTER OTHALAN ETHEL O
|
186 |
+
::u 16E0 ::r ea ::comment ᛠ RUNIC LETTER EAR
|
187 |
+
::u 16E1 ::r io ::comment ᛡ RUNIC LETTER IOR
|
188 |
+
::u 16E2 ::r q ::comment ᛢ RUNIC LETTER CWEORTH
|
189 |
+
::u 16E3 ::r k ::comment ᛣ RUNIC LETTER CALC
|
190 |
+
::u 16E4 ::r k ::comment ᛤ RUNIC LETTER CEALC
|
191 |
+
::u 16E5 ::r st ::comment ᛥ RUNIC LETTER STAN
|
192 |
+
::u 16E6 ::r r ::comment ᛦ RUNIC LETTER LONG-BRANCH-YR
|
193 |
+
::u 16E7 ::r r ::comment ᛧ RUNIC LETTER SHORT-TWIG-YR
|
194 |
+
::u 16E8 ::r r ::comment ᛨ RUNIC LETTER ICELANDIC-YR
|
195 |
+
::u 16E9 ::r q ::comment ᛩ RUNIC LETTER Q
|
196 |
+
::u 16EA ::r x ::comment ᛪ RUNIC LETTER X
|
197 |
+
|
198 |
+
::u 17B9 ::r oe ::comment Khmer vowel sign y (short)
|
199 |
+
::u 17BA ::r oe ::comment Khmer vowel sign yy (long)
|
200 |
+
::u 17C6 ::r +m ::comment Khmer sign nikahit (cf. anusvara)
|
201 |
+
::u 17C7 ::r +h ::comment Khmer sign reahmuk (cf. visarga)
|
202 |
+
::u 17C8 ::r ' ::comment Khmer sign yuukaleapintu (short vowel and glottal stop)
|
203 |
+
::u 17C9 ::r "" ::comment Khmer sign muusikatoan: changes the second register to the first
|
204 |
+
::u 17CA ::r "" ::comment Khmer sign triisap: changes the first register to the second
|
205 |
+
::u 17CB ::r "" ::comment Khmer sign bantoc (vowel shortener)
|
206 |
+
::u 17D2 ::r "" ::comment Khmer sign coeng (foot/subscript, cf. virama = no vowel)
|
207 |
+
::u 17D5 ::r . ::comment Khmer sign bariyoosan; period ending entire text or chapter
|
208 |
+
|
209 |
+
::u 180E ::r ' ::comment Mongolian vowel separator
|
210 |
+
|
211 |
+
::u 1B80 ::r +ng ::comment ᮀ Sundanese sign panyecek
|
212 |
+
::u 1B81 ::r +r ::comment ᮁ Sundanese sign panglayar
|
213 |
+
::u 1B82 ::r +h ::comment ᮂ Sundanese sign pangwisad
|
214 |
+
::u 1BA1 ::r ya ::comment ᮡ Sundanese consonant sign pamingkal
|
215 |
+
::u 1BA2 ::r ra ::comment ᮢ Sundanese consonant sign panyakr
|
216 |
+
::u 1BA3 ::r la ::comment ᮣ Sundanese consonant sign panyiku
|
217 |
+
::u 1BA4 ::r i ::comment ᮤ Sundanese consonant sign panghulu
|
218 |
+
::u 1BA5 ::r u ::comment ᮥ Sundanese consonant sign panyuku
|
219 |
+
::u 1BA6 ::r e ::comment ᮦ Sundanese vowel sign panaelaeng
|
220 |
+
::u 1BA7 ::r o ::comment ᮧ Sundanese vowel sign panolong
|
221 |
+
::u 1BA8 ::r e ::comment ᮨ Sundanese vowel sign pamepet
|
222 |
+
::u 1BA9 ::r eu ::comment ᮩ Sundanese vowel sign paneuleung
|
223 |
+
::u 1BAA ::r "" ::comment ᮪ Sundanese sign pamaaeh or patén (no vowel/virama)
|
224 |
+
|
225 |
+
::u 1FBD ::r "" ::comment ᾽ Greek koronis
|
226 |
+
::u 1FFE ::r "" ::comment Greek dasia (rough breathing)
|
227 |
+
|
228 |
+
::u 2002 ::r " " ::comment en space
|
229 |
+
::u 2003 ::r " " ::comment em space
|
230 |
+
::u 2004 ::r " " ::comment three-per-em space
|
231 |
+
::u 2005 ::r " " ::comment four-per-em space
|
232 |
+
::u 2006 ::r " " ::comment six-per-em space
|
233 |
+
::u 2007 ::r " " ::comment figure space
|
234 |
+
::u 2008 ::r " " ::comment punctuation space
|
235 |
+
::u 2009 ::r " " ::comment thin space
|
236 |
+
::u 200A ::r " " ::comment hair space
|
237 |
+
::u 202F ::r " " ::comment narrow no-break space
|
238 |
+
|
239 |
+
::u 2D30 ::r a ::comment TIFINAGH LETTER YA ⴰ
|
240 |
+
::u 2D31 ::r b ::comment TIFINAGH LETTER YAB ⴱ
|
241 |
+
::u 2D32 ::r bh ::comment TIFINAGH LETTER YABH ⴲ
|
242 |
+
::u 2D33 ::r g ::comment TIFINAGH LETTER YAG ⴳ
|
243 |
+
::u 2D34 ::r ghh ::comment TIFINAGH LETTER YAGHH ⴴ
|
244 |
+
::u 2D35 ::r j ::comment TIFINAGH LETTER BERBER ACADEMY YAJ ⴵ
|
245 |
+
::u 2D36 ::r j ::comment TIFINAGH LETTER YAJ ⴶ
|
246 |
+
::u 2D37 ::r d ::comment TIFINAGH LETTER YAD ⴷ
|
247 |
+
::u 2D38 ::r dh ::comment TIFINAGH LETTER YADH ⴸ
|
248 |
+
::u 2D39 ::r dd ::comment TIFINAGH LETTER YADD ⴹ
|
249 |
+
::u 2D3A ::r ddh ::comment TIFINAGH LETTER YADDH ⴺ
|
250 |
+
::u 2D3B ::r e ::comment TIFINAGH LETTER YEY ⴻ
|
251 |
+
::u 2D3C ::r f ::comment TIFINAGH LETTER YAF ⴼ
|
252 |
+
::u 2D3D ::r k ::comment TIFINAGH LETTER YAK ⴽ
|
253 |
+
::u 2D3E ::r k ::comment TIFINAGH LETTER TUAREG YAK ⴾ
|
254 |
+
::u 2D3F ::r khh ::comment TIFINAGH LETTER YAKHH ⴿ
|
255 |
+
::u 2D40 ::r h ::comment TIFINAGH LETTER YAH ⵀ
|
256 |
+
::u 2D41 ::r h ::comment TIFINAGH LETTER BERBER ACADEMY YAH ⵁ
|
257 |
+
::u 2D42 ::r h ::comment TIFINAGH LETTER TUAREG YAH ⵂ
|
258 |
+
::u 2D43 ::r hh ::comment TIFINAGH LETTER YAHH ⵃ
|
259 |
+
::u 2D44 ::r ' ::comment TIFINAGH LETTER YAA ⵄ
|
260 |
+
::u 2D45 ::r kh ::comment TIFINAGH LETTER YAKH ⵅ
|
261 |
+
::u 2D46 ::r kh ::comment TIFINAGH LETTER TUAREG YAKH ⵆ
|
262 |
+
::u 2D47 ::r q ::comment TIFINAGH LETTER YAQ ⵇ
|
263 |
+
::u 2D48 ::r q ::comment TIFINAGH LETTER TUAREG YAQ ⵈ
|
264 |
+
::u 2D49 ::r i ::comment TIFINAGH LETTER YI ⵉ
|
265 |
+
::u 2D4A ::r zh ::comment TIFINAGH LETTER YAZH ⵊ
|
266 |
+
::u 2D4B ::r zh ::comment TIFINAGH LETTER AHAGGAR YAZH ⵋ
|
267 |
+
::u 2D4C ::r zh ::comment TIFINAGH LETTER TUAREG YAZH ⵌ
|
268 |
+
::u 2D4D ::r l ::comment TIFINAGH LETTER YAL ⵍ
|
269 |
+
::u 2D4E ::r m ::comment TIFINAGH LETTER YAM ⵎ
|
270 |
+
::u 2D4F ::r n ::comment TIFINAGH LETTER YAN ⵏ
|
271 |
+
::u 2D50 ::r gn ::comment TIFINAGH LETTER TUAREG YAGN ⵐ
|
272 |
+
::u 2D51 ::r ng ::comment TIFINAGH LETTER TUAREG YANG ⵑ
|
273 |
+
::u 2D52 ::r p ::comment TIFINAGH LETTER YAP ⵒ
|
274 |
+
::u 2D53 ::r u ::comment TIFINAGH LETTER YU ⵓ
|
275 |
+
::u 2D54 ::r r ::comment TIFINAGH LETTER YAR ⵔ
|
276 |
+
::u 2D55 ::r rr ::comment TIFINAGH LETTER YARR ⵕ
|
277 |
+
::u 2D56 ::r gh ::comment TIFINAGH LETTER YAGH ⵖ
|
278 |
+
::u 2D57 ::r gh ::comment TIFINAGH LETTER TUAREG YAGH ⵗ
|
279 |
+
::u 2D58 ::r gh ::comment TIFINAGH LETTER AYER YAGH ⵘ
|
280 |
+
::u 2D59 ::r s ::comment TIFINAGH LETTER YAS ⵙ
|
281 |
+
::u 2D5A ::r ss ::comment TIFINAGH LETTER YASS ⵚ
|
282 |
+
::u 2D5B ::r sh ::comment TIFINAGH LETTER YASH ⵛ
|
283 |
+
::u 2D5C ::r t ::comment TIFINAGH LETTER YAT ⵜ
|
284 |
+
::u 2D5D ::r th ::comment TIFINAGH LETTER YATH ⵝ
|
285 |
+
::u 2D5E ::r ch ::comment TIFINAGH LETTER YACH ⵞ
|
286 |
+
::u 2D5F ::r tt ::comment TIFINAGH LETTER YATT ⵟ
|
287 |
+
::u 2D60 ::r v ::comment TIFINAGH LETTER YAV ⵠ
|
288 |
+
::u 2D61 ::r w ::comment TIFINAGH LETTER YAW ⵡ
|
289 |
+
::u 2D62 ::r y ::comment TIFINAGH LETTER YAY ⵢ
|
290 |
+
::u 2D63 ::r z ::comment TIFINAGH LETTER YAZ ⵣ
|
291 |
+
::u 2D64 ::r z ::comment TIFINAGH LETTER TAWELLEMET YAZ ⵤ
|
292 |
+
::u 2D65 ::r zz ::comment TIFINAGH LETTER YAZZ ⵥ
|
293 |
+
::u 2D66 ::r ye ::comment TIFINAGH LETTER YE ⵦ
|
294 |
+
::u 2D67 ::r yo ::comment TIFINAGH LETTER YO ⵧ
|
295 |
+
::u 2D6F ::r "" ::comment TIFINAGH MODIFIER LETTER LABIALIZATION MARK ⵯ
|
296 |
+
::u 2D70 ::r "" ::comment TIFINAGH SEPARATOR MARK ⵰
|
297 |
+
::u 2D7F ::r "" ::comment TIFINAGH CONSONANT JOINER ⵿
|
298 |
+
|
299 |
+
::u 3063 ::r tsu ::comment Hiragana letter small tsu
|
300 |
+
::u 30C3 ::r tsu ::comment Katakana letter small tsu
|
301 |
+
|
302 |
+
::u ABE3 ::r o ::comment ꯣ Meetei Mayek vowel sign onap
|
303 |
+
::u ABE7 ::r ou ::comment ꯧ Meetei Mayek vowel sign sounap
|
304 |
+
|
305 |
+
::u F008 ::r "" ::comment Yoruba diacritic in private use area
|
306 |
+
::u F00F ::r "" ::comment Yoruba diacritic in private use area
|
307 |
+
::u F023 ::r "" ::comment Yoruba diacritic in private use area
|
308 |
+
::u F025 ::r "" ::comment Yoruba diacritic in private use area
|
309 |
+
|
310 |
+
::u F8D0 ::r a ::name KLINGON LETTER A
|
311 |
+
::u F8D1 ::r b ::name KLINGON LETTER B
|
312 |
+
::u F8D2 ::r ch ::name KLINGON LETTER CH
|
313 |
+
::u F8D3 ::r D ::name KLINGON LETTER D
|
314 |
+
::u F8D4 ::r e ::name KLINGON LETTER E
|
315 |
+
::u F8D5 ::r gh ::name KLINGON LETTER GH
|
316 |
+
::u F8D6 ::r H ::name KLINGON LETTER H
|
317 |
+
::u F8D7 ::r I ::name KLINGON LETTER I
|
318 |
+
::u F8D8 ::r j ::name KLINGON LETTER J
|
319 |
+
::u F8D9 ::r l ::name KLINGON LETTER L
|
320 |
+
::u F8DA ::r m ::name KLINGON LETTER M
|
321 |
+
::u F8DB ::r n ::name KLINGON LETTER N
|
322 |
+
::u F8DC ::r ng ::name KLINGON LETTER NG
|
323 |
+
::u F8DD ::r o ::name KLINGON LETTER O
|
324 |
+
::u F8DE ::r p ::name KLINGON LETTER P
|
325 |
+
::u F8DF ::r q ::name KLINGON LETTER Q
|
326 |
+
::u F8E0 ::r Q ::name KLINGON LETTER Q
|
327 |
+
::u F8E1 ::r r ::name KLINGON LETTER R
|
328 |
+
::u F8E2 ::r S ::name KLINGON LETTER S
|
329 |
+
::u F8E3 ::r t ::name KLINGON LETTER T
|
330 |
+
::u F8E4 ::r tlh ::name KLINGON LETTER TLH
|
331 |
+
::u F8E5 ::r u ::name KLINGON LETTER U
|
332 |
+
::u F8E6 ::r v ::name KLINGON LETTER V
|
333 |
+
::u F8E7 ::r w ::name KLINGON LETTER W
|
334 |
+
::u F8E8 ::r y ::name KLINGON LETTER Y
|
335 |
+
::u F8E9 ::r ' ::name KLINGON LETTER GLOTTAL STOP
|
336 |
+
::u F8F0 ::num 0 ::name KLINGON DIGIT ZERO
|
337 |
+
::u F8F1 ::num 1 ::name KLINGON DIGIT ONE
|
338 |
+
::u F8F2 ::num 2 ::name KLINGON DIGIT TWO
|
339 |
+
::u F8F3 ::num 3 ::name KLINGON DIGIT THREE
|
340 |
+
::u F8F4 ::num 4 ::name KLINGON DIGIT FOUR
|
341 |
+
::u F8F5 ::num 5 ::name KLINGON DIGIT FIVE
|
342 |
+
::u F8F6 ::num 6 ::name KLINGON DIGIT SIX
|
343 |
+
::u F8F7 ::num 7 ::name KLINGON DIGIT SEVEN
|
344 |
+
::u F8F8 ::num 8 ::name KLINGON DIGIT EIGHT
|
345 |
+
::u F8F9 ::num 9 ::name KLINGON DIGIT NINE
|
346 |
+
::u F8FD ::r , ::name KLINGON COMMA
|
347 |
+
::u F8FE ::r . ::name KLINGON FULL STOP
|
348 |
+
::u F8FF ::name KLINGON MUMMIFICATION GLYPH
|
349 |
+
|
350 |
+
::u 1163D ::r +m ::comment Modi sign anusvara
|
351 |
+
::u 1163E ::r +h ::comment Modi sign visarga
|
352 |
+
|
353 |
+
::u 13068 ::num 1000000 ::comment Egyptian Hieroglyph
|
354 |
+
::u 1308B ::r r ::comment Egyptian Hieroglyph ::pic mouth
|
355 |
+
::u 1309D ::r ' ::comment Egyptian Hieroglyph (ayn) ::pic forearm
|
356 |
+
::u 130A7 ::r d ::comment Egyptian Hieroglyph ::pic hand
|
357 |
+
::u 130AD ::num 10000 ::comment Egyptian Hieroglyph
|
358 |
+
::u 130AE ::num 20000 ::comment Egyptian Hieroglyph
|
359 |
+
::u 130AF ::num 30000 ::comment Egyptian Hieroglyph
|
360 |
+
::u 130B0 ::num 40000 ::comment Egyptian Hieroglyph
|
361 |
+
::u 130B1 ::num 50000 ::comment Egyptian Hieroglyph
|
362 |
+
::u 130B2 ::num 60000 ::comment Egyptian Hieroglyph
|
363 |
+
::u 130B3 ::num 70000 ::comment Egyptian Hieroglyph
|
364 |
+
::u 130B4 ::num 80000 ::comment Egyptian Hieroglyph
|
365 |
+
::u 130B5 ::num 90000 ::comment Egyptian Hieroglyph
|
366 |
+
::u 130B6 ::num 50000 ::comment Egyptian Hieroglyph
|
367 |
+
::u 130C0 ::r b ::comment Egyptian Hieroglyph ::pic foot
|
368 |
+
::u 130ED ::r l ::comment Egyptian Hieroglyph [also rw] ::pic lion recumbent
|
369 |
+
::u 13121 ::r h ::comment Egyptian Hieroglyph (f-underscore) ::pic aninal's belly and udder
|
370 |
+
::u 1313F ::r a ::comment Egyptian Hieroglyph (alef) ::pic vulture
|
371 |
+
::u 13153 ::r m ::comment Egyptian Hieroglyph ::pic owl
|
372 |
+
::u 13171 ::r w ::comment Egyptian Hieroglyph ::pic quail chick
|
373 |
+
::u 13187 ::r ::comment Egyptian Hieroglyph (determinative/son) H8 ::pic egg
|
374 |
+
::u 13190 ::num 100000 ::comment Egyptian Hieroglyph
|
375 |
+
::u 13191 ::r f ::comment Egyptian Hieroglyph ::pic horned viper
|
376 |
+
::u 13193 ::r d ::comment Egyptian Hieroglyph (J) ::pic cobra
|
377 |
+
::u 131BC ::num 1000 ::comment Egyptian Hieroglyph
|
378 |
+
::u 131BD ::num 2000 ::comment Egyptian Hieroglyph
|
379 |
+
::u 131BE ::num 3000 ::comment Egyptian Hieroglyph
|
380 |
+
::u 131BF ::num 4000 ::comment Egyptian Hieroglyph
|
381 |
+
::u 131C0 ::num 5000 ::comment Egyptian Hieroglyph
|
382 |
+
::u 131C1 ::num 6000 ::comment Egyptian Hieroglyph
|
383 |
+
::u 131C2 ::num 7000 ::comment Egyptian Hieroglyph
|
384 |
+
::u 131C3 ::num 8000 ::comment Egyptian Hieroglyph
|
385 |
+
::u 131C4 ::num 9000 ::comment Egyptian Hieroglyph
|
386 |
+
::u 131CB ::r i ::comment Egyptian Hieroglyph (yod) ::pic single reed
|
387 |
+
::u 131CC ::r y ::comment Egyptian Hieroglyph ::pic double reed
|
388 |
+
::u 1320E ::r q ::comment Egyptian Hieroglyph (qaf) ::pic sandy slope
|
389 |
+
::u 13209 ::comment Egyptian Hieroglyph ::pic desert hills
|
390 |
+
::u 13216 ::r n ::comment Egyptian Hieroglyph ::pic ripple of water
|
391 |
+
::u 13219 ::r sh ::comment Egyptian Hieroglyph (š) ::pic basin
|
392 |
+
::u 13254 ::r h ::comment Egyptian Hieroglyph ::pic reed shelter
|
393 |
+
::u 13283 ::r z ::comment Egyptian Hieroglyph [also S?] ::pic door bolt
|
394 |
+
::u 132AA ::r p ::comment Egyptian Hieroglyph ::pic stool
|
395 |
+
::u 132D4 ::r n ::comment Egyptian Hieroglyph ::pic red crown
|
396 |
+
::u 132F4 ::r s ::comment Egyptian Hieroglyph [also Z?] ::pic folded cloth
|
397 |
+
::u 13319 ::comment Egyptian Hieroglyph ::pic throw stick
|
398 |
+
::u 13362 ::num 100 ::comment Egyptian Hieroglyph
|
399 |
+
::u 13363 ::num 200 ::comment Egyptian Hieroglyph
|
400 |
+
::u 13364 ::num 300 ::comment Egyptian Hieroglyph
|
401 |
+
::u 13365 ::num 400 ::comment Egyptian Hieroglyph
|
402 |
+
::u 13366 ::num 500 ::comment Egyptian Hieroglyph
|
403 |
+
::u 13367 ::num 600 ::comment Egyptian Hieroglyph
|
404 |
+
::u 13368 ::num 700 ::comment Egyptian Hieroglyph
|
405 |
+
::u 13369 ::num 800 ::comment Egyptian Hieroglyph
|
406 |
+
::u 1336A ::num 900 ::comment Egyptian Hieroglyph
|
407 |
+
::u 1336B ::num 500 ::comment Egyptian Hieroglyph
|
408 |
+
::u 1336F ::r o ::comment Egyptian Hieroglyph ::pic lasso
|
409 |
+
::u 1337F ::r t ::comment Egyptian Hieroglyph (ṯ) ::pic hobble
|
410 |
+
::u 13386 ::num 10 ::comment Egyptian Hieroglyph
|
411 |
+
::u 13387 ::num 20 ::comment Egyptian Hieroglyph
|
412 |
+
::u 13388 ::num 30 ::comment Egyptian Hieroglyph
|
413 |
+
::u 13389 ::num 40 ::comment Egyptian Hieroglyph
|
414 |
+
::u 1338A ::num 50 ::comment Egyptian Hieroglyph
|
415 |
+
::u 1338B ::num 60 ::comment Egyptian Hieroglyph
|
416 |
+
::u 1338C ::num 70 ::comment Egyptian Hieroglyph
|
417 |
+
::u 1338D ::num 80 ::comment Egyptian Hieroglyph
|
418 |
+
::u 1338E ::num 90 ::comment Egyptian Hieroglyph
|
419 |
+
::u 1338F ::num 20 ::comment Egyptian Hieroglyph
|
420 |
+
::u 13390 ::num 30 ::comment Egyptian Hieroglyph
|
421 |
+
::u 13391 ::num 40 ::comment Egyptian Hieroglyph
|
422 |
+
::u 13392 ::num 50 ::comment Egyptian Hieroglyph
|
423 |
+
::u 1339B ::r h ::comment Egyptian Hieroglyph ::pic twisted flax
|
424 |
+
::u 133A1 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle
|
425 |
+
::u 133A2 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle, variant
|
426 |
+
::u 133A4 ::r g ::comment Egyptian Hieroglyph ::pic bag
|
427 |
+
::u 133BC ::r g ::comment Egyptian Hieroglyph ::pic stand
|
428 |
+
::u 133CF ::r t ::comment Egyptian Hieroglyph ::pic loaf
|
429 |
+
::u 133ED ::r y ::comment Egyptian Hieroglyph ::pic two strokes
|
430 |
+
::u 133F2 ::r w ::comment Egyptian Hieroglyph ::pic quail chick, hieratic variant
|
431 |
+
::u 133FA ::num 1 ::comment Egyptian Hieroglyph
|
432 |
+
::u 133FB ::num 2 ::comment Egyptian Hieroglyph
|
433 |
+
::u 133FC ::num 3 ::comment Egyptian Hieroglyph
|
434 |
+
::u 133FD ::num 4 ::comment Egyptian Hieroglyph
|
435 |
+
::u 133FE ::num 5 ::comment Egyptian Hieroglyph
|
436 |
+
::u 133FF ::num 6 ::comment Egyptian Hieroglyph
|
437 |
+
::u 13400 ::num 7 ::comment Egyptian Hieroglyph
|
438 |
+
::u 13401 ::num 8 ::comment Egyptian Hieroglyph
|
439 |
+
::u 13402 ::num 9 ::comment Egyptian Hieroglyph
|
440 |
+
::u 13403 ::num 5 ::comment Egyptian Hieroglyph
|
441 |
+
::u 1340D ::r kh ::comment Egyptian Hieroglyph (ḫ, khah) ::pic placenta?
|
442 |
+
::u 1341D ::r m ::comment Egyptian Hieroglyph (also jm)
|
uroman/data/romanization-table-arabic-block.txt
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::s ، ::t , ::comment ARABIC COMMA
|
2 |
+
::s ؛ ::t ; ::comment ARABIC SEMICOLON
|
3 |
+
::s ؟ ::t ? ::comment ARABIC QUESTION MARK
|
4 |
+
::s ء ::t ' ::comment ARABIC LETTER HAMZA
|
5 |
+
::s آ ::t a ::comment ARABIC LETTER ALEF WITH MADDA ABOVE
|
6 |
+
::s أ ::t a ::comment ARABIC LETTER ALEF WITH HAMZA ABOVE
|
7 |
+
::s ؤ ::t w ::comment ARABIC LETTER WAW WITH HAMZA ABOVE
|
8 |
+
::s إ ::t i ::comment ARABIC LETTER ALEF WITH HAMZA BELOW
|
9 |
+
::s ئ ::t ye ::comment ARABIC LETTER YEH WITH HAMZA ABOVE
|
10 |
+
::s ا ::t a ::comment ARABIC LETTER ALEF
|
11 |
+
::s ب ::t b ::comment ARABIC LETTER BEH
|
12 |
+
::s ة ::t a ::comment ARABIC LETTER TEH MARBUTA
|
13 |
+
::s ت ::t t ::comment ARABIC LETTER TEH
|
14 |
+
::s ث ::t th ::comment ARABIC LETTER THEH
|
15 |
+
::s ج ::t j ::comment ARABIC LETTER JEEM
|
16 |
+
::s ح ::t h ::comment ARABIC LETTER HAH
|
17 |
+
::s خ ::t kh ::comment ARABIC LETTER KHAH
|
18 |
+
::s د ::t d ::comment ARABIC LETTER DAL
|
19 |
+
::s ذ ::t th ::comment ARABIC LETTER THAL
|
20 |
+
::s ر ::t r ::comment ARABIC LETTER REH
|
21 |
+
::s ز ::t z ::comment ARABIC LETTER ZAIN
|
22 |
+
::s س ::t s ::comment ARABIC LETTER SEEN
|
23 |
+
::s ش ::t sh ::comment ARABIC LETTER SHEEN
|
24 |
+
::s ص ::t s ::comment ARABIC LETTER SAD
|
25 |
+
::s ض ::t d ::comment ARABIC LETTER DAD
|
26 |
+
::s ط ::t t ::comment ARABIC LETTER TAH
|
27 |
+
::s ظ ::t z ::comment ARABIC LETTER ZAH
|
28 |
+
::s ع ::t ' ::comment ARABIC LETTER AIN
|
29 |
+
::s غ ::t gh ::comment ARABIC LETTER GHAIN
|
30 |
+
::s ـ ::t - ::comment ARABIC TATWEEL
|
31 |
+
::s ف ::t f ::comment ARABIC LETTER FEH
|
32 |
+
::s ق ::t q ::comment ARABIC LETTER QAF
|
33 |
+
::s ك ::t k ::comment ARABIC LETTER KAF
|
34 |
+
::s ل ::t l ::comment ARABIC LETTER LAM
|
35 |
+
::s م ::t m ::comment ARABIC LETTER MEEM
|
36 |
+
::s ن ::t n ::comment ARABIC LETTER NOON
|
37 |
+
::s ه ::t h ::comment ARABIC LETTER HEH
|
38 |
+
::s و ::t w ::comment ARABIC LETTER WAW
|
39 |
+
::s ى ::t a ::comment ARABIC LETTER ALEF MAKSURA
|
40 |
+
::s ي ::t y ::comment ARABIC LETTER YEH
|
41 |
+
::s َ ::t a ::comment ARABIC FATHA
|
42 |
+
::s ُ ::t u ::comment ARABIC DAMMA
|
43 |
+
::s ِ ::t i ::comment ARABIC KASRA
|
44 |
+
::s ْ ::t ::comment ARABIC SUKUN
|
45 |
+
::s ٔ ::t ' ::comment ARABIC HAMZA ABOVE
|
46 |
+
::s ٕ ::t ' ::comment ARABIC HAMZA BELOW
|
47 |
+
::s ٠ ::t 0 ::comment ARABIC-INDIC DIGIT ZERO
|
48 |
+
::s ١ ::t 1 ::comment ARABIC-INDIC DIGIT ONE
|
49 |
+
::s ٢ ::t 2 ::comment ARABIC-INDIC DIGIT TWO
|
50 |
+
::s ٣ ::t 3 ::comment ARABIC-INDIC DIGIT THREE
|
51 |
+
::s ٤ ::t 4 ::comment ARABIC-INDIC DIGIT FOUR
|
52 |
+
::s ٥ ::t 5 ::comment ARABIC-INDIC DIGIT FIVE
|
53 |
+
::s ٦ ::t 6 ::comment ARABIC-INDIC DIGIT SIX
|
54 |
+
::s ٧ ::t 7 ::comment ARABIC-INDIC DIGIT SEVEN
|
55 |
+
::s ٨ ::t 8 ::comment ARABIC-INDIC DIGIT EIGHT
|
56 |
+
::s ٩ ::t 9 ::comment ARABIC-INDIC DIGIT NINE
|
57 |
+
::s ٪ ::t % ::comment ARABIC PERCENT SIGN
|
58 |
+
::s ٫ ::t , ::comment ARABIC DECIMAL SEPARATOR
|
59 |
+
::s ٬ ::t , ::comment ARABIC THOUSANDS SEPARATOR
|
60 |
+
::s ٮ ::t b ::comment ARABIC LETTER DOTLESS BEH
|
61 |
+
::s ٯ ::t q ::comment ARABIC LETTER DOTLESS QAF
|
62 |
+
::s ٰ ::t a ::comment ARABIC LETTER SUPERSCRIPT ALEF
|
63 |
+
::s ٱ ::t a ::comment ARABIC LETTER ALEF WASLA
|
64 |
+
::s ٲ ::t a ::comment ARABIC LETTER ALEF WITH WAVY HAMZA ABOVE
|
65 |
+
::s ٳ ::t a ::comment ARABIC LETTER ALEF WITH WAVY HAMZA BELOW
|
66 |
+
::s ٷ ::t u ::comment ARABIC LETTER U WITH HAMZA ABOVE
|
67 |
+
::s ٹ ::t tt ::comment ARABIC LETTER TTEH
|
68 |
+
::s ٺ ::t tt ::comment ARABIC LETTER TTEHEH
|
69 |
+
::s ٻ ::t b ::comment ARABIC LETTER BEEH
|
70 |
+
::s ټ ::t t ::comment ARABIC LETTER TEH WITH RING
|
71 |
+
::s ٽ ::t t ::comment ARABIC LETTER TEH WITH THREE DOTS ABOVE DOWNWARDS
|
72 |
+
::s پ ::t p ::comment ARABIC LETTER PEH
|
73 |
+
::s ٿ ::t t ::comment ARABIC LETTER TEHEH
|
74 |
+
::s ڀ ::t b ::comment ARABIC LETTER BEHEH
|
75 |
+
::s ځ ::t h ::comment ARABIC LETTER HAH WITH HAMZA ABOVE
|
76 |
+
::s ڂ ::t h ::comment ARABIC LETTER HAH WITH TWO DOTS VERTICAL ABOVE
|
77 |
+
::s ڃ ::t ny ::comment ARABIC LETTER NYEH
|
78 |
+
::s ڄ ::t dy ::comment ARABIC LETTER DYEH
|
79 |
+
::s څ ::t h ::comment ARABIC LETTER HAH WITH THREE DOTS ABOVE
|
80 |
+
::s چ ::t tch ::comment ARABIC LETTER TCHEH
|
81 |
+
::s ڇ ::t tch ::comment ARABIC LETTER TCHEHEH
|
82 |
+
::s ڈ ::t dd ::comment ARABIC LETTER DDAL
|
83 |
+
::s ډ ::t d ::comment ARABIC LETTER DAL WITH RING
|
84 |
+
::s ڊ ::t d ::comment ARABIC LETTER DAL WITH DOT BELOW
|
85 |
+
::s ڋ ::t d ::comment ARABIC LETTER DAL WITH DOT BELOW AND SMALL TAH
|
86 |
+
::s ڌ ::t d ::comment ARABIC LETTER DAHAL
|
87 |
+
::s ڍ ::t dd ::comment ARABIC LETTER DDAHAL
|
88 |
+
::s ڎ ::t d ::comment ARABIC LETTER DUL
|
89 |
+
::s ڏ ::t d ::comment ARABIC LETTER DAL WITH THREE DOTS ABOVE DOWNWARDS
|
90 |
+
::s ڐ ::t d ::comment ARABIC LETTER DAL WITH FOUR DOTS ABOVE
|
91 |
+
::s ڑ ::t rr ::comment ARABIC LETTER RREH
|
92 |
+
::s ڒ ::t r ::comment ARABIC LETTER REH WITH SMALL V
|
93 |
+
::s ړ ::t r ::comment ARABIC LETTER REH WITH RING
|
94 |
+
::s ڔ ::t r ::comment ARABIC LETTER REH WITH DOT BELOW
|
95 |
+
::s ڕ ::t r ::comment ARABIC LETTER REH WITH SMALL V BELOW
|
96 |
+
::s ږ ::t r ::comment ARABIC LETTER REH WITH DOT BELOW AND DOT ABOVE
|
97 |
+
::s ڗ ::t r ::comment ARABIC LETTER REH WITH TWO DOTS ABOVE
|
98 |
+
::s ژ ::t j ::comment ARABIC LETTER JEH
|
99 |
+
::s ڙ ::t r ::comment ARABIC LETTER REH WITH FOUR DOTS ABOVE
|
100 |
+
::s ښ ::t s ::comment ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
|
101 |
+
::s ڛ ::t s ::comment ARABIC LETTER SEEN WITH THREE DOTS BELOW
|
102 |
+
::s ڜ ::t s ::comment ARABIC LETTER SEEN WITH THREE DOTS BELOW AND THREE DOTS ABOVE
|
103 |
+
::s ڝ ::t s ::comment ARABIC LETTER SAD WITH TWO DOTS BELOW
|
104 |
+
::s ڞ ::t s ::comment ARABIC LETTER SAD WITH THREE DOTS ABOVE
|
105 |
+
::s ڟ ::t t ::comment ARABIC LETTER TAH WITH THREE DOTS ABOVE
|
106 |
+
::s ڠ ::t n ::comment ARABIC LETTER AIN WITH THREE DOTS ABOVE
|
107 |
+
::s ڡ ::t f ::comment ARABIC LETTER DOTLESS FEH
|
108 |
+
::s ڢ ::t f ::comment ARABIC LETTER FEH WITH DOT MOVED BELOW
|
109 |
+
::s ڣ ::t f ::comment ARABIC LETTER FEH WITH DOT BELOW
|
110 |
+
::s ڤ ::t v ::comment ARABIC LETTER VEH
|
111 |
+
::s ڥ ::t f ::comment ARABIC LETTER FEH WITH THREE DOTS BELOW
|
112 |
+
::s ڦ ::t p ::comment ARABIC LETTER PEHEH
|
113 |
+
::s ڧ ::t q ::comment ARABIC LETTER QAF WITH DOT ABOVE
|
114 |
+
::s ڨ ::t q ::comment ARABIC LETTER QAF WITH THREE DOTS ABOVE
|
115 |
+
::s ک ::t k ::comment ARABIC LETTER KEHEH
|
116 |
+
::s ڪ ::t k ::comment ARABIC LETTER SWASH KAF
|
117 |
+
::s ګ ::t k ::comment ARABIC LETTER KAF WITH RING
|
118 |
+
::s ڬ ::t k ::comment ARABIC LETTER KAF WITH DOT ABOVE
|
119 |
+
::s ڭ ::t ng ::comment ARABIC LETTER NG
|
120 |
+
::s ڮ ::t k ::comment ARABIC LETTER KAF WITH THREE DOTS BELOW
|
121 |
+
::s گ ::t g ::comment ARABIC LETTER GAF
|
122 |
+
::s ڰ ::t g ::comment ARABIC LETTER GAF WITH RING
|
123 |
+
::s ڱ ::t ng ::comment ARABIC LETTER NGOEH
|
124 |
+
::s ڲ ::t g ::comment ARABIC LETTER GAF WITH TWO DOTS BELOW
|
125 |
+
::s ڳ ::t g ::comment ARABIC LETTER GUEH
|
126 |
+
::s ڴ ::t g ::comment ARABIC LETTER GAF WITH THREE DOTS ABOVE
|
127 |
+
::s ڵ ::t l ::comment ARABIC LETTER LAM WITH SMALL V
|
128 |
+
::s ڶ ::t l ::comment ARABIC LETTER LAM WITH DOT ABOVE
|
129 |
+
::s ڷ ::t l ::comment ARABIC LETTER LAM WITH THREE DOTS ABOVE
|
130 |
+
::s ڸ ::t l ::comment ARABIC LETTER LAM WITH THREE DOTS BELOW
|
131 |
+
::s ڹ ::t n ::comment ARABIC LETTER NOON WITH DOT BELOW
|
132 |
+
::s ں ::t n ::comment ARABIC LETTER NOON GHUNNA
|
133 |
+
::s ڻ ::t rn ::comment ARABIC LETTER RNOON
|
134 |
+
::s ڼ ::t n ::comment ARABIC LETTER NOON WITH RING
|
135 |
+
::s ڽ ::t n ::comment ARABIC LETTER NOON WITH THREE DOTS ABOVE
|
136 |
+
::s ھ ::t h ::comment ARABIC LETTER HEH DOACHASHMEE
|
137 |
+
::s ڿ ::t tch ::comment ARABIC LETTER TCHEH WITH DOT ABOVE
|
138 |
+
::s ۀ ::t h ::comment ARABIC LETTER HEH WITH YEH ABOVE
|
139 |
+
::s ہ ::t h ::comment ARABIC LETTER HEH GOAL
|
140 |
+
::s ۂ ::t h ::comment ARABIC LETTER HEH GOAL WITH HAMZA ABOVE
|
141 |
+
::s ۃ ::t a ::comment ARABIC LETTER TEH MARBUTA GOAL
|
142 |
+
::s ۄ ::t w ::comment ARABIC LETTER WAW WITH RING
|
143 |
+
::s ۅ ::t oe ::comment ARABIC LETTER KIRGHIZ OE
|
144 |
+
::s ۆ ::t oe ::comment ARABIC LETTER OE
|
145 |
+
::s ۇ ::t u ::comment ARABIC LETTER U
|
146 |
+
::s ۈ ::t yu ::comment ARABIC LETTER YU
|
147 |
+
::s ۉ ::t yu ::comment ARABIC LETTER KIRGHIZ YU
|
148 |
+
::s ۊ ::t w ::comment ARABIC LETTER WAW WITH TWO DOTS ABOVE
|
149 |
+
::s ۋ ::t v ::comment ARABIC LETTER VE
|
150 |
+
::s ی ::t y ::comment ARABIC LETTER FARSI YEH
|
151 |
+
::s ۍ ::t y ::comment ARABIC LETTER YEH WITH TAIL
|
152 |
+
::s ێ ::t y ::comment ARABIC LETTER YEH WITH SMALL V
|
153 |
+
::s ۏ ::t w ::comment ARABIC LETTER WAW WITH DOT ABOVE
|
154 |
+
::s ې ::t e ::comment ARABIC LETTER E
|
155 |
+
::s ۑ ::t y ::comment ARABIC LETTER YEH WITH THREE DOTS BELOW
|
156 |
+
::s ے ::t y ::comment ARABIC LETTER YEH BARREE
|
157 |
+
::s ۓ ::t y ::comment ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
|
158 |
+
::s ۔ ::t . ::comment ARABIC FULL STOP
|
159 |
+
::s ە ::t ae ::comment ARABIC LETTER AE
|
160 |
+
::s ۮ ::t d ::comment ARABIC LETTER DAL WITH INVERTED V
|
161 |
+
::s ۯ ::t r ::comment ARABIC LETTER REH WITH INVERTED V
|
162 |
+
::s ۰ ::t 0 ::comment EXTENDED ARABIC-INDIC DIGIT ZERO
|
163 |
+
::s ۱ ::t 1 ::comment EXTENDED ARABIC-INDIC DIGIT ONE
|
164 |
+
::s ۲ ::t 2 ::comment EXTENDED ARABIC-INDIC DIGIT TWO
|
165 |
+
::s ۳ ::t 3 ::comment EXTENDED ARABIC-INDIC DIGIT THREE
|
166 |
+
::s ۴ ::t 4 ::comment EXTENDED ARABIC-INDIC DIGIT FOUR
|
167 |
+
::s ۵ ::t 5 ::comment EXTENDED ARABIC-INDIC DIGIT FIVE
|
168 |
+
::s ۶ ::t 6 ::comment EXTENDED ARABIC-INDIC DIGIT SIX
|
169 |
+
::s ۷ ::t 7 ::comment EXTENDED ARABIC-INDIC DIGIT SEVEN
|
170 |
+
::s ۸ ::t 8 ::comment EXTENDED ARABIC-INDIC DIGIT EIGHT
|
171 |
+
::s ۹ ::t 9 ::comment EXTENDED ARABIC-INDIC DIGIT NINE
|
172 |
+
::s ۺ ::t sh ::comment ARABIC LETTER SHEEN WITH DOT BELOW
|
173 |
+
::s ۻ ::t d ::comment ARABIC LETTER DAD WITH DOT BELOW
|
174 |
+
::s ۼ ::t gh ::comment ARABIC LETTER GHAIN WITH DOT BELOW
|
175 |
+
::s ۽ ::t & ::comment ARABIC SIGN SINDHI AMPERSAND
|
176 |
+
::s ﷲ ::t allah ::comment ARABIC LIGATURE ALLAH ISOLATED FORM
|
177 |
+
|
178 |
+
::s ::t ::comment ZERO WIDTH NON-JOINER
|
179 |
+
::s ::t ::comment ZERO WIDTH JOINER
|
uroman/data/romanization-table.txt
ADDED
@@ -0,0 +1,2019 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
## European Latin extensions
|
3 |
+
# Vowels
|
4 |
+
::s Ä ::t Ae
|
5 |
+
::s Ö ::t Oe
|
6 |
+
::s Ü ::t Ue
|
7 |
+
::s Å ::t Aa
|
8 |
+
::s Æ ::t Ae
|
9 |
+
::s Ø ::t oe
|
10 |
+
::s Œ ::t Oe
|
11 |
+
::s ä ::t ae
|
12 |
+
::s ö ::t oe
|
13 |
+
::s ü ::t ue
|
14 |
+
::s å ::t aa
|
15 |
+
::s æ ::t ae
|
16 |
+
::s ø ::t oe
|
17 |
+
::s œ ::t oe
|
18 |
+
# Consonants
|
19 |
+
::s Ç ::t S
|
20 |
+
::s ç ::t s
|
21 |
+
::s Ç ::t Ch ::lcode tur
|
22 |
+
::s ç ::t ch ::lcode tur
|
23 |
+
::s Ş ::t Sh
|
24 |
+
::s ş ::t sh
|
25 |
+
::s Ș ::t Sh
|
26 |
+
::s ș ::t sh
|
27 |
+
::s ß ::t ss
|
28 |
+
::s Ț ::t Ts
|
29 |
+
::s ț ::t ts
|
30 |
+
|
31 |
+
# Digraphs
|
32 |
+
# ::s ʣ ::t dz
|
33 |
+
::s ʤ ::t dzh ::comment Latin small letter dezh digraph
|
34 |
+
# ::s ʥ ::t dz
|
35 |
+
# ::s ʦ ::t ts
|
36 |
+
::s ʧ ::t tsh ::comment Latin small letter tesh digraph
|
37 |
+
# ::s ʨ ::t tc
|
38 |
+
|
39 |
+
# Miscellaneous
|
40 |
+
::s ə ::t e
|
41 |
+
|
42 |
+
# English
|
43 |
+
::s chr ::t chr ::t-alt kr ::example chromosome, synchronize
|
44 |
+
::s Chr ::t Chr ::t-alt Kr ::example Christmas, Chrysler
|
45 |
+
::s eight ::t eight ::t-alt eit ::example eight, weight
|
46 |
+
::s Eight ::t Eight ::t-alt Eit ::example Eighteen
|
47 |
+
::s ight ::t ight ::t-alt ait ::example Knight
|
48 |
+
::s gh ::t gh ::t-alt f, ph, "" ::example laugh, daughter
|
49 |
+
::s high ::t high ::t-alt hai ::example highlight
|
50 |
+
::s High ::t High ::t-alt Hai ::example High School
|
51 |
+
::s Isle ::t Isle ::t-alt Ail ::use-only-for-whole-word ::example Isle
|
52 |
+
::s Island ::t Island ::t-alt Ailand ::use-only-for-whole-word ::example Island
|
53 |
+
::s kn ::t kn ::t-alt n ::use-only-at-start-of-word ::example knowledge
|
54 |
+
::s Kn ::t Kn ::t-alt N ::use-only-at-start-of-word ::example Knight
|
55 |
+
::s Mc ::t Mc ::t-alt Mac ::use-only-at-start-of-word ::example McNulty
|
56 |
+
::s mc ::t mc ::t-alt mac ::use-only-at-start-of-word
|
57 |
+
::s oo ::t oo ::t-alt u ::lcode eng ::example Brooklyn; Goose Bay
|
58 |
+
::s ph ::t ph ::t-alt f ::example alpha
|
59 |
+
::s Ph ::t Ph ::t-alt F ::example Philip
|
60 |
+
::s Thom ::t Thom ::t-alt Tom ::use-only-at-start-of-word ::example Thomas, Thompson
|
61 |
+
::s tion ::t tion ::t-alt shen ::example
|
62 |
+
::s Sean ::t Sean ::t-alt Shawn ::use-only-for-whole-word
|
63 |
+
::s ssion ::t ssion ::t-alt shen ::example Sessions
|
64 |
+
::s St ::t St ::t-alt Saint ::use-only-for-whole-word
|
65 |
+
::s St. ::t St. ::t-alt Saint ::use-only-for-whole-word
|
66 |
+
::s Wr ::t Wr ::t-alt R ::example Wren
|
67 |
+
::s wr ::t wr ::t-alt r ::example Cartwright
|
68 |
+
::s x ::t x ::t-alt ks ::example Mexico
|
69 |
+
::s x ::t x ::t-alt gz ::example example, anxiety, exhaust, exit
|
70 |
+
|
71 |
+
# French
|
72 |
+
::s â ::t a ::t-alt as ::example pâte/paste, pastry
|
73 |
+
::s ê ::t e ::t-alt es ::example fête/feast
|
74 |
+
::s î ::t i ::t-alt is ::example île/isle
|
75 |
+
::s ô ::t o ::t-alt os ::example côte/coast
|
76 |
+
::s û ::t u ::t-alt us ::example août/August
|
77 |
+
::s eaux ::t eaux ::t-alt o ::example Bordeaux
|
78 |
+
::s eau ::t eau ::t-alt o ::example Chateau
|
79 |
+
::s auld ::t auld ::t-alt o ::use-only-at-end-of-word ::example Renauld
|
80 |
+
::s ault ::t ault ::t-alt o ::use-only-at-end-of-word ::example Renault
|
81 |
+
::s oux ::t oux ::t-alt u
|
82 |
+
::s ois ::t ois ::t-alt oa ::use-only-at-end-of-word ::example Dubois
|
83 |
+
|
84 |
+
# German
|
85 |
+
::s Sch ::t Sch ::t-alt Sh
|
86 |
+
::s sch ::t sch ::t-alt sh
|
87 |
+
::s stein ::t stein ::t-alt shtain
|
88 |
+
::s dt ::t dt ::t-alt tt ::use-only-at-end-of-word ::example Schmidt
|
89 |
+
|
90 |
+
# Dutch
|
91 |
+
::s ij ::t ij ::t-alt ai
|
92 |
+
::s Ij ::t Ij ::t-alt Ai
|
93 |
+
|
94 |
+
# Latvian
|
95 |
+
::s Ā ::t A ::t-alt Aa ::lcode lav
|
96 |
+
::s ā ::t a ::t-alt aa ::lcode lav
|
97 |
+
::s Ē ::t E ::t-alt Ee ::lcode lav
|
98 |
+
::s ē ::t e ::t-alt ee ::lcode lav
|
99 |
+
::s Ī ::t I ::t-alt Ii ::lcode lav
|
100 |
+
::s ī ::t i ::t-alt ii ::lcode lav
|
101 |
+
::s Ū ::t U ::t-alt Uu ::lcode lav
|
102 |
+
::s ū ::t u ::t-alt uu ::lcode lav
|
103 |
+
::s Ģ ::t G ::t-alt Gj ::lcode lav
|
104 |
+
::s ģ ::t g ::t-alt gj ::lcode lav
|
105 |
+
::s Ķ ::t K ::t-alt Kj ::lcode lav
|
106 |
+
::s ķ ::t k ::t-alt kj ::lcode lav
|
107 |
+
::s Ļ ::t L ::t-alt Lj ::lcode lav
|
108 |
+
::s ļ ::t l ::t-alt lj ::lcode lav
|
109 |
+
::s Ņ ::t N ::t-alt Nj ::lcode lav
|
110 |
+
::s ņ ::t n ::t-alt nj ::lcode lav
|
111 |
+
::s C ::t C ::t-alt Ts ::lcode lav
|
112 |
+
::s c ::t c ::t-alt ts ::lcode lav
|
113 |
+
::s Č ::t C ::t-alt Tsh ::lcode lav
|
114 |
+
::s č ::t c ::t-alt tsh ::lcode lav
|
115 |
+
::s Š ::t Sh ::t-alt s ::lcode lav
|
116 |
+
::s š ::t sh ::t-alt s ::lcode lav
|
117 |
+
::s Ž ::t Z ::t-alt Zh ::lcode lav
|
118 |
+
::s ž ::t z ::t-alt zh ::lcode lav
|
119 |
+
|
120 |
+
# Lithuanian
|
121 |
+
::s C ::t C ::t-alt Ts ::lcode lit
|
122 |
+
::s c ::t c ::t-alt ts ::lcode lit
|
123 |
+
::s Č ::t C ::t-alt Tsh ::lcode lit
|
124 |
+
::s č ::t c ::t-alt tsh ::lcode lit
|
125 |
+
::s Š ::t Sh ::t-alt s ::lcode lit
|
126 |
+
::s š ::t sh ::t-alt s ::lcode lit
|
127 |
+
::s Ž ::t Z ::t-alt Zh ::lcode lit
|
128 |
+
::s ž ::t z ::t-alt zh ::lcode lit
|
129 |
+
|
130 |
+
# International Greek (e.g. as used in chemical compounds)
|
131 |
+
::s β ::t b
|
132 |
+
::s Β ::t B
|
133 |
+
::s ϐ ::t b
|
134 |
+
|
135 |
+
# Ancient Greek
|
136 |
+
::s β ::t b ::lcode grc
|
137 |
+
::s Β ::t B ::lcode grc
|
138 |
+
::s γγ ::t ng ::lcode grc
|
139 |
+
::s γκ ::t nk ::lcode grc
|
140 |
+
::s γξ ::t nx ::lcode grc
|
141 |
+
::s γχ ::t nch ::lcode grc
|
142 |
+
::s ϱ ::t r ::lcode grc
|
143 |
+
|
144 |
+
# Pontic Greek
|
145 |
+
::s β ::t v ::t-alt b ::lcode pnt
|
146 |
+
::s Β ::t V ::t-alt B ::lcode pnt
|
147 |
+
::s ϐ ::t v ::t-alt b ::lcode pnt
|
148 |
+
|
149 |
+
# Modern Greek (generally the default)
|
150 |
+
::s β ::t v ::t-alt b ::lcode ell
|
151 |
+
::s Β ::t V ::t-alt B ::lcode ell
|
152 |
+
::s ϐ ::t v ::t-alt b ::lcode ell
|
153 |
+
::s Ι ::t I
|
154 |
+
::s ι ::t i
|
155 |
+
::s ί ::t i
|
156 |
+
::s ἶ ::t i
|
157 |
+
::s Υ ::t Y
|
158 |
+
::s υ ::t y
|
159 |
+
::s Ρ ::t R
|
160 |
+
::s ρ ::t r
|
161 |
+
::s ϱ ::t r
|
162 |
+
::s Χ ::t Ch ::t-alt Kh
|
163 |
+
::s χ ::t ch ::t-alt kh
|
164 |
+
::s φ ::t f ::t-alt ph
|
165 |
+
::s Φ ::t F ::t-alt Ph
|
166 |
+
::s Ντ ::t D
|
167 |
+
::s ντ ::t nd ::t-alt d, nt
|
168 |
+
# ::s ντζ ::t ntz
|
169 |
+
::s Μπ ::t B
|
170 |
+
::s μπ ::t b ::use-only-at-start-of-word
|
171 |
+
::s μπ ::t mb ::t-alt b, mp ::dont-use-at-start-of-word
|
172 |
+
::s λμπ ::t lb
|
173 |
+
::s νμπ ::t nb
|
174 |
+
::s ρμπ ::t rb
|
175 |
+
::s γγ ::t ng
|
176 |
+
::s Γκ ::t G
|
177 |
+
::s γκ ::t ng ::t-alt g ::dont-use-at-start-of-word
|
178 |
+
::s γκ ::t g ::use-only-at-start-of-word
|
179 |
+
::s γξ ::t nx ::lcode grc
|
180 |
+
::s γχ ::t nch ::lcode grc
|
181 |
+
::s ει ::t ei ::t-alt i
|
182 |
+
::s Ει ::t Ei ::t-alt I
|
183 |
+
::s ευ ::t eu ::t-alt ev ::comment donated by Constantine
|
184 |
+
::s Ευ ::t Eu ::t-alt Ev ::comment donated by Constantine
|
185 |
+
::s αυ ::t au ::t-alt av
|
186 |
+
::s Αυ ::t Au ::t-alt Av
|
187 |
+
::s ου ::t ou ::t-alt u
|
188 |
+
::s Ου ::t Ou ::t-alt U
|
189 |
+
::s ηυ ::t eu
|
190 |
+
::s Ηυ ::t Eu
|
191 |
+
::s υι ::t ui
|
192 |
+
::s Υι ::t Ui
|
193 |
+
::s ωυ ::t ou
|
194 |
+
::s Ωυ ::t Ou
|
195 |
+
::s ͺ ::t ::comment GREEK YPOGEGRAMMENI (U+037A)
|
196 |
+
::s ϒ ::t Y ::comment GREEK UPSILON WITH HOOK SYMBOL (U+03D2)
|
197 |
+
::s ϓ ::t Y ::comment GREEK UPSILON WITH ACUTE AND HOOK SYMBOL (U+03D3)
|
198 |
+
::s ϔ ::t Y ::comment GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL (U+03D4)
|
199 |
+
::s ι ::t ::comment GREEK PROSGEGRAMMENI (U+1FBE)
|
200 |
+
::s ᾿ ::t ::comment GREEK PSILI (U+1FBF)
|
201 |
+
::s ῀ ::t ::comment GREEK PERISPOMENI (U+1FC0)
|
202 |
+
::s ` ::t ::comment GREEK VARIA (U+1FEF)
|
203 |
+
::s ´ ::t ::comment GREEK OXIA (U+1FFD)
|
204 |
+
|
205 |
+
# Glagolitic
|
206 |
+
::s Ⰿ ::t M ::comment GLAGOLITIC CAPITAL LETTER MYSLITE (U+2C0F)
|
207 |
+
::s Ⱞ ::t M ::comment GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE (U+2C2E)
|
208 |
+
::s ⰿ ::t m ::comment GLAGOLITIC SMALL LETTER MYSLITE (U+2C3F)
|
209 |
+
::s ⱞ ::t m ::comment GLAGOLITIC SMALL LETTER LATINATE MYSLITE (U+2C5E)
|
210 |
+
::s 𞀏 ::t m ::comment COMBINING GLAGOLITIC LETTER MYSLITE (U+1E00F)
|
211 |
+
|
212 |
+
# Cyrillic
|
213 |
+
::s Г ::t G ::t-alt H ::comment Cyrillic capital ghe
|
214 |
+
::s г ::t g ::t-alt h ::comment Cyrillic small ghe
|
215 |
+
::s Е ::t E ::t-alt Ye ::comment Cyrillic capital ie
|
216 |
+
::s е ::t e ::t-alt ye ::comment Cyrillic small ie
|
217 |
+
::s Ё ::t E ::t-alt Yo
|
218 |
+
::s ё ::t e ::t-alt yo
|
219 |
+
::s Х ::t Kh ::t-alt Ch, H ::comment Cyrillic capital ha
|
220 |
+
::s х ::t kh ::t-alt ch, h ::comment Cyrillic small ha
|
221 |
+
::s Щ ::t Shch ::t-alt Sh
|
222 |
+
::s щ ::t shch ::t-alt sh
|
223 |
+
::s Ъ ::t ::comment Cyrillic capital hard sign
|
224 |
+
::s ъ ::t ::comment Cyrillic small hard sign
|
225 |
+
::s ᲆ ::t ::comment CYRILLIC SMALL LETTER TALL HARD SIGN
|
226 |
+
::s Ы ::t Y ::comment Cyrillic capital yeru
|
227 |
+
::s ы ::t y ::comment Cyrillic small yeru
|
228 |
+
::s Ь ::t ::comment Cyrillic capital soft sign
|
229 |
+
::s ь ::t ::comment Cyrillic small soft sign
|
230 |
+
::s Ж ::t Zh ::comment Cyrillic capital letter zhe
|
231 |
+
::s Ш ::t Sh ::comment Cyrillic capital letter sha
|
232 |
+
::s Ч ::t Ch ::comment Cyrillic capital letter che
|
233 |
+
::s Џ ::t Dzh ::comment Cyrillic capital letter dzhe
|
234 |
+
::s Є ::t Ie ::comment Cyrillic capital letter ie
|
235 |
+
::s Ю ::t Yu ::comment Cyrillic capital letter yu
|
236 |
+
::s Я ::t Ya ::comment Cyrillic capital letter ya
|
237 |
+
|
238 |
+
::s Ҥ ::t Ng ::comment Cyrillic capital ligature EN GHE
|
239 |
+
::s ҥ ::t ng ::comment Cyrillic small ligature EN GHE
|
240 |
+
::s Ә ::t e ::comment Cyrillic capital schwa
|
241 |
+
::s ә ::t e ::comment Cyrillic small schwa
|
242 |
+
::s Ӏ ::t ' ::comment Cyrillic palochka
|
243 |
+
::s Ҵ ::t TS ::comment Cyrillic capital ligature te tse, used in Abkhasian
|
244 |
+
::s ҵ ::t ts ::comment Cyrillic small ligature te tse, used in Abkhasian
|
245 |
+
::s Ӕ ::t AE ::comment Cyrillic capital ligature a ie
|
246 |
+
::s ӕ ::t ae ::comment Cyrillic small ligature a ie
|
247 |
+
::s ʹ ::t "'" ::comment modifier letter prime
|
248 |
+
::s ʺ ::t '"' ::comment modifier letter double prime
|
249 |
+
::s ий ::t iy ::dont-use-at-end-of-word
|
250 |
+
::s ий ::t y ::use-only-at-end-of-word
|
251 |
+
|
252 |
+
::s ᲈ ::t u ::comment CYRILLIC SMALL LETTER UNBLENDED UK ligature ou
|
253 |
+
|
254 |
+
# Russian
|
255 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode rus ::comment Cyrillic capital letter ghe
|
256 |
+
::s г ::t g ::t-alt _NONE_ ::lcode rus ::comment Cyrillic small letter ghe
|
257 |
+
::s Й ::t Y ::t-alt I, J ::lcode rus ::comment Cyrillic capital letter short i
|
258 |
+
::s й ::t y ::t-alt i, j ::lcode rus ::comment Cyrillic small letter short i
|
259 |
+
::s Ц ::t Ts ::t-alt C ::lcode rus ::comment Cyrillic capital letter tse
|
260 |
+
::s ц ::t ts ::t-alt c ::lcode rus ::comment Cyrillic small letter tse
|
261 |
+
::s Щ ::t Shch ::t-alt _NONE_ ::lcode rus ::comment Cyrillic capital letter shcha
|
262 |
+
::s щ ::t shch ::t-alt _NONE_ ::lcode rus ::comment Cyrillic small letter shcha
|
263 |
+
::s Ѣ ::t E ::t-alt Ie ::lcode rus ::comment archaic Cyrillic capital letter yat
|
264 |
+
::s ѣ ::t e ::t-alt ie ::lcode rus ::comment archaic Cyrillic small letter yat
|
265 |
+
::s Е ::t E ::t-alt Ye ::dont-use-at-start-of-word ::lcode rus ::comment Cyrillic capital ie
|
266 |
+
::s Е ::t Ye ::t-alt E ::use-only-at-start-of-word ::lcode rus
|
267 |
+
::s е ::t e ::t-alt ye ::dont-use-at-start-of-word ::lcode rus ::comment Cyrillic small ie
|
268 |
+
::s е ::t ye ::t-alt e ::use-only-at-start-of-word ::lcode rus
|
269 |
+
::s ае ::t aye ::lcode rus
|
270 |
+
::s а́е ::t aye ::lcode rus
|
271 |
+
::s ее ::t eye ::lcode rus
|
272 |
+
::s е́е ::t eye ::lcode rus
|
273 |
+
::s ие ::t iye ::lcode rus
|
274 |
+
::s и́е ::t iye ::lcode rus
|
275 |
+
::s ое ::t oye ::lcode rus
|
276 |
+
::s о́е ::t oye ::lcode rus
|
277 |
+
::s уе ::t uye ::lcode rus
|
278 |
+
::s у́е ::t uye ::lcode rus
|
279 |
+
::s ье ::t ye ::lcode rus
|
280 |
+
::s ъе ::t ye ::lcode rus
|
281 |
+
::s Ё ::t Yo ::t-alt E ::lcode rus ::comment Cyrillic capital io
|
282 |
+
::s ё ::t yo ::t-alt e ::lcode rus
|
283 |
+
::s аё ::t ayo ::lcode rus
|
284 |
+
::s а́ё ::t ayo ::lcode rus
|
285 |
+
::s её ::t eyo ::lcode rus
|
286 |
+
::s е́ё ::t eyo ::lcode rus
|
287 |
+
::s иё ::t iyo ::lcode rus
|
288 |
+
::s и́ё ::t iyo ::lcode rus
|
289 |
+
::s оё ::t oyo ::lcode rus
|
290 |
+
::s о́ё ::t oyo ::lcode rus
|
291 |
+
::s уё ::t uyo ::lcode rus
|
292 |
+
::s у́ё ::t uyo ::lcode rus
|
293 |
+
::s ьё ::t yo ::lcode rus
|
294 |
+
::s ъё ::t yo ::lcode rus
|
295 |
+
::s ий ::t y ::lcode rus
|
296 |
+
|
297 |
+
# Ukrainian
|
298 |
+
::s Г ::t H ::lcode ukr ::comment Ukrainian capital letter he
|
299 |
+
::s г ::t h ::lcode ukr ::comment Ukrainian small letter he
|
300 |
+
::s Ґ ::t G ::lcode ukr ::comment Ukrainian capital letter ghe
|
301 |
+
::s ґ ::t g ::lcode ukr ::comment Ukrainian small letter ghe
|
302 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic capital ie
|
303 |
+
::s е ::t e ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic small ie
|
304 |
+
::s И ::t Y ::lcode ukr ::comment Ukrainian capital letter i
|
305 |
+
::s и ::t y ::lcode ukr ::comment Ukrainian small letter i
|
306 |
+
::s Ї ::t Yi ::lcode ukr ::comment Ukrainian capital letter yi
|
307 |
+
::s ї ::t yi ::lcode ukr ::comment Ukrainian small letter yi
|
308 |
+
::s Й ::t I ::t-alt Y ::lcode ukr ::comment Cyrillic capital letter short i
|
309 |
+
::s й ::t i ::t-alt y ::lcode ukr ::comment Cyrillic small letter short i
|
310 |
+
::s Ц ::t Ts ::t-alt C ::lcode ukr ::comment Cyrillic capital letter tse
|
311 |
+
::s ц ::t ts ::t-alt c ::lcode ukr ::comment Cyrillic small letter tse
|
312 |
+
::s Щ ::t Shch ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic capital letter shcha
|
313 |
+
::s щ ::t shch ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic small letter shcha
|
314 |
+
::s Ѣ ::t E ::t-alt Ie ::lcode ukr ::comment archaic Cyrillic capital letter yat
|
315 |
+
::s ѣ ::t e ::t-alt ie ::lcode ukr ::comment archaic Cyrillic small letter yat
|
316 |
+
::s Иї ::t Yi ::lcode ukr ::comment avoid Yyi
|
317 |
+
::s иї ::t yi ::lcode ukr ::comment avoid yyi
|
318 |
+
::s ій ::t iy ::lcode ukr
|
319 |
+
::s і́й ::t iy ::lcode ukr
|
320 |
+
::s ий ::t y ::lcode ukr ::comment Зеленський/Zelensky
|
321 |
+
|
322 |
+
# Belarusian
|
323 |
+
::s Г ::t H ::t-alt G ::lcode bel ::comment capital letter he
|
324 |
+
::s г ::t h ::t-alt g ::lcode bel ::comment small letter he
|
325 |
+
::s Ґ ::t G ::lcode bel ::comment capital letter ghe
|
326 |
+
::s ґ ::t g ::lcode bel ::comment small letter ghe
|
327 |
+
::s Й ::t J ::t-alt Y ::lcode bel ::comment Cyrillic capital letter short i
|
328 |
+
::s й ::t j ::t-alt y ::lcode bel ::comment Cyrillic small letter short i
|
329 |
+
::s Ц ::t Ts ::t-alt C ::lcode bel ::comment Cyrillic capital letter tse
|
330 |
+
::s ц ::t ts ::t-alt c ::lcode bel ::comment Cyrillic small letter tse
|
331 |
+
::s Щ ::t Shch ::t-alt _NONE_ ::lcode bel ::comment Cyrillic capital letter shcha
|
332 |
+
::s щ ::t shch ::t-alt _NONE_ ::lcode bel ::comment Cyrillic small letter shcha
|
333 |
+
::s Ѣ ::t E ::t-alt Ie ::lcode bel ::comment archaic Cyrillic capital letter yat
|
334 |
+
::s ѣ ::t e ::t-alt ie ::lcode bel ::comment archaic Cyrillic small letter yat
|
335 |
+
::s 'я ::t ya ::lcode bel
|
336 |
+
::s ’я ::t ya ::lcode bel
|
337 |
+
::s 'і ::t i ::lcode bel
|
338 |
+
::s ’і ::t i ::lcode bel
|
339 |
+
::s Ё ::t Yo ::t-alt E ::lcode bel ::comment Cyrillic capital io
|
340 |
+
::s ё ::t yo ::t-alt e ::lcode bel
|
341 |
+
::s ёў ::t you ::lcode bel
|
342 |
+
::s ий ::t y ::lcode bel
|
343 |
+
|
344 |
+
# Serbian
|
345 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ghe
|
346 |
+
::s г ::t g ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ghe
|
347 |
+
::s Х ::t H ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ha
|
348 |
+
::s х ::t h ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ha
|
349 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ie
|
350 |
+
::s е ::t e ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ie
|
351 |
+
::s Ђ ::t Dj ::lcode srp ::comment Cyrillic capital dje
|
352 |
+
::s Љ ::t Lj ::lcode srp ::comment Cyrillic capital lje
|
353 |
+
::s Ћ ::t Tsh ::lcode srp ::comment Cyrillic capital tshe
|
354 |
+
::s Ж ::t Zh ::lcode srp ::comment Cyrillic capital zhe
|
355 |
+
::s Ц ::t C ::t-alt Ts ::lcode srp ::comment Cyrillic capital tse
|
356 |
+
::s ц ::t c ::t-alt ts ::lcode srp ::comment Cyrillic small tse
|
357 |
+
::s Đ ::t Dj ::lcode srp ::comment Latin capital d with stroke
|
358 |
+
::s đ ::t dj ::lcode srp ::comment Latin small d with stroke
|
359 |
+
::s Ž ::t Zh ::lcode srp ::comment Latin capital z with caron
|
360 |
+
::s ž ::t zh ::lcode srp ::comment Latin small z with caron
|
361 |
+
::s Ć ::t Tsh ::lcode srp ::comment Latin capital c with acute
|
362 |
+
::s ć ::t tsh ::lcode srp ::comment Latin small c with acute
|
363 |
+
::s Č ::t Ch ::lcode srp ::comment Latin capital c with caron
|
364 |
+
::s č ::t ch ::lcode srp ::comment Latin small c with caron
|
365 |
+
::s Š ::t Sh ::lcode srp ::comment Latin capital s with caron
|
366 |
+
::s š ::t sh ::lcode srp ::comment Latin small s with caron
|
367 |
+
|
368 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ghe
|
369 |
+
::s г ::t g ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ghe
|
370 |
+
::s Х ::t H ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ha
|
371 |
+
::s х ::t h ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ha
|
372 |
+
::s Ц ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter tse
|
373 |
+
::s ц ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter tse
|
374 |
+
::s Ч ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter che
|
375 |
+
::s ч ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter che
|
376 |
+
::s Џ ::t Dz ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter dzhe
|
377 |
+
::s џ ::t dz ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter dzhe
|
378 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ie
|
379 |
+
::s е ::t e ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ie
|
380 |
+
::s Ш ::t S ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital sha
|
381 |
+
::s ш ::t s ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small sha
|
382 |
+
::s Ж ::t Z ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital zhe
|
383 |
+
::s ж ::t z ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small zhe
|
384 |
+
::s Љ ::t Lj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital lje
|
385 |
+
::s љ ::t lj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small lje
|
386 |
+
::s Њ ::t Nj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital nje
|
387 |
+
::s њ ::t nj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small nje
|
388 |
+
::s Ђ ::t Dj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital dje
|
389 |
+
::s ђ ::t dj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small dje
|
390 |
+
::s Ћ ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital tshe
|
391 |
+
::s ћ ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small tshe
|
392 |
+
::s Đ ::t Dj ::lcode srp2 ::comment Latin capital d with stroke
|
393 |
+
::s đ ::t dj ::lcode srp2 ::comment Latin small d with stroke
|
394 |
+
|
395 |
+
# Montenegrin extension (controversial)
|
396 |
+
::s З́ ::t Zj ::lcode srp ::comment Cyrillic capital zje
|
397 |
+
::s з́ ::t zj ::lcode srp ::comment Cyrillic small zje
|
398 |
+
::s С́ ::t Sj ::lcode srp ::comment Cyrillic capital sje
|
399 |
+
::s с́ ::t sj ::lcode srp ::comment Cyrillic small sje
|
400 |
+
::s Ź ::t Zj ::lcode srp ::comment Latin capital z with acute
|
401 |
+
::s ź ::t zj ::lcode srp ::comment Latin small z with acute
|
402 |
+
::s Ś ::t Sj ::lcode srp ::comment Latin capital s with acute
|
403 |
+
::s ś ::t sj ::lcode srp ::comment Latin small s with acute
|
404 |
+
|
405 |
+
::s З́ ::t Z ::lcode srp2 ::comment Cyrillic capital zje
|
406 |
+
::s з́ ::t z ::lcode srp2 ::comment Cyrillic small zje
|
407 |
+
::s С́ ::t S ::lcode srp2 ::comment Cyrillic capital sje
|
408 |
+
::s с́ ::t s ::lcode srp2 ::comment Cyrillic small sje
|
409 |
+
::s Ź ::t Z ::lcode srp2 ::comment Latin capital z with acute
|
410 |
+
::s ź ::t z ::lcode srp2 ::comment Latin small z with acute
|
411 |
+
::s Ś ::t S ::lcode srp2 ::comment Latin capital s with acute
|
412 |
+
::s ś ::t s ::lcode srp2 ::comment Latin small s with acute
|
413 |
+
|
414 |
+
# Bulgarian
|
415 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital ghe
|
416 |
+
::s г ::t g ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small ghe
|
417 |
+
::s Х ::t H ::t-alt Kh ::lcode bul ::comment Cyrillic capital letter ha
|
418 |
+
::s х ::t h ::t-alt kh ::lcode bul ::comment Cyrillic small letter ha
|
419 |
+
::s Ц ::t C ::t-alt Ts ::lcode bul ::comment Cyrillic capital letter tse
|
420 |
+
::s ц ::t c ::t-alt ts ::lcode bul ::comment Cyrillic small letter tse
|
421 |
+
::s Щ ::t Sht ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital letter shcha
|
422 |
+
::s щ ::t sht ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small letter shcha
|
423 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital ie
|
424 |
+
::s е ::t e ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small ie
|
425 |
+
::s Ж ::t Zh ::t-alt Z, J ::lcode bul ::comment Cyrillic capital zhe
|
426 |
+
::s ж ::t zh ::t-alt z, j ::lcode bul ::comment Cyrillic small zhe
|
427 |
+
::s Й ::t I ::t-alt Y, J ::lcode bul ::comment Cyrillic capital letter short i
|
428 |
+
::s й ::t i ::t-alt y, j ::lcode bul ::comment Cyrillic small letter short i
|
429 |
+
::s Ю ::t Yu ::t-alt U, Ju, Iu ::lcode bul ::comment Cyrillic capital letter yu
|
430 |
+
::s ю ::t yu ::t-alt u, ju, iu ::lcode bul ::comment Cyrillic small letter yu
|
431 |
+
::s Ъ ::t U ::t-alt A ::lcode bul ::comment Cyrillic capital letter hard sign
|
432 |
+
::s ъ ::t u ::t-alt a ::lcode bul ::comment Cyrillic small letter hard sign
|
433 |
+
::s Ѣ ::t E ::t-alt Ie ::lcode bul ::comment archaic Cyrillic capital letter yat
|
434 |
+
::s ѣ ::t e ::t-alt ie ::lcode bul ::comment archaic Cyrillic small letter yat
|
435 |
+
::s Ѫ ::t U ::lcode bul ::comment archaic Cyrillic capital letter yus
|
436 |
+
::s ѫ ::t u ::lcode bul ::comment archaic Cyrillic small letter yus
|
437 |
+
::s ИЯ ::t IA ::lcode bul ::use-only-at-end-of-word
|
438 |
+
::s ия ::t ia ::lcode bul ::use-only-at-end-of-word
|
439 |
+
|
440 |
+
::s Ž ::t Zh ::lcode bul ::comment Latin capital z with caron
|
441 |
+
::s ž ::t zh ::lcode bul ::comment Latin small z with caron
|
442 |
+
::s Č ::t Ch ::lcode bul ::comment Latin capital c with caron
|
443 |
+
::s č ::t ch ::lcode bul ::comment Latin small c with caron
|
444 |
+
::s Š ::t Sh ::lcode bul ::comment Latin capital s with caron
|
445 |
+
::s š ::t sh ::lcode bul ::comment Latin small s with caron
|
446 |
+
::s Ŝ ::t Sht ::lcode bul ::comment Latin capital s with circumflex
|
447 |
+
::s ŝ ::t sht ::lcode bul ::comment Latin small s with circumflex
|
448 |
+
::s Û ::t Yu ::t-alt U, Ju, Iu ::lcode bul ::comment Latin capital u with circumflex
|
449 |
+
::s û ::t yu ::t-alt u, ju, iu ::lcode bul ::comment Latin small u with circumflex
|
450 |
+
::s  ::t Ya ::t-alt _NONE_ ::lcode bul ::comment Latin capital a with circumflex
|
451 |
+
::s â ::t ya ::t-alt _NONE_ ::lcode bul ::comment Latin small a with circumflex
|
452 |
+
::s Ŭ ::t U ::t-alt A ::lcode bul ::comment Latin capital u with breve (for hard sign)
|
453 |
+
::s ŭ ::t u ::t-alt a ::lcode bul ::comment Latin small u with breve (for hard sign)
|
454 |
+
::s Ǎ ::t U ::t-alt A ::lcode bul ::comment Latin capital a with caron (for hard sign)
|
455 |
+
::s ǎ ::t u ::t-alt a ::lcode bul ::comment Latin small a with caron (for hard sign)
|
456 |
+
|
457 |
+
# Macedonian
|
458 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic capital ghe
|
459 |
+
::s г ::t g ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic small ghe
|
460 |
+
::s Х ::t H ::lcode mkd ::comment Cyrillic capital ha
|
461 |
+
::s х ::t h ::lcode mkd ::comment Cyrillic small ha
|
462 |
+
::s Ц ::t C ::t-alt Ts ::lcode mkd ::comment Cyrillic capital letter tse
|
463 |
+
::s ц ::t c ::t-alt ts ::lcode mkd ::comment Cyrillic small letter tse
|
464 |
+
::s Џ ::t Dzh ::t-alt Dj, Dz ::lcode mkd ::comment Cyrillic capital letter dzhe
|
465 |
+
::s џ ::t dzh ::t-alt dj, dz ::lcode mkd ::comment Cyrillic small letter dzhe
|
466 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic capital ie
|
467 |
+
::s е ::t e ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic small ie
|
468 |
+
::s Ž ::t Zh ::lcode mkd ::comment Latin capital z with caron
|
469 |
+
::s ž ::t zh ::lcode mkd ::comment Latin small z with caron
|
470 |
+
::s Č ::t Ch ::lcode mkd ::comment Latin capital c with caron
|
471 |
+
::s č ::t ch ::lcode mkd ::comment Latin small c with caron
|
472 |
+
::s Š ::t Sh ::lcode mkd ::comment Latin capital s with caron
|
473 |
+
::s š ::t sh ::lcode mkd ::comment Latin small s with caron
|
474 |
+
::s Ǵ ::t Gj ::lcode mkd
|
475 |
+
::s ǵ ::t gj ::lcode mkd
|
476 |
+
::s Đ ::t Gj ::lcode mkd
|
477 |
+
::s đ ::t gj ::lcode mkd
|
478 |
+
::s Ẑ ::t Dz ::lcode mkd
|
479 |
+
::s ẑ ::t dz ::lcode mkd
|
480 |
+
::s J̌ ::t J ::lcode mkd
|
481 |
+
::s ǰ ::t j ::lcode mkd
|
482 |
+
::s L̂ ::t Lj ::lcode mkd
|
483 |
+
::s l̂ ::t lj ::lcode mkd
|
484 |
+
::s N̂ ::t Nj ::lcode mkd
|
485 |
+
::s n̂ ::t nj ::lcode mkd
|
486 |
+
::s Ḱ ::t Kj ::lcode mkd
|
487 |
+
::s ḱ ::t kj ::lcode mkd
|
488 |
+
::s Ć ::t Kj ::lcode mkd
|
489 |
+
::s ć ::t kj ::lcode mkd
|
490 |
+
::s D̂ ::t Dzh ::lcode mkd
|
491 |
+
::s d̂ ::t dzh ::lcode mkd
|
492 |
+
|
493 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ghe
|
494 |
+
::s г ::t g ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ghe
|
495 |
+
::s Х ::t H ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ha
|
496 |
+
::s х ::t h ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ha
|
497 |
+
::s Ц ::t C ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter tse
|
498 |
+
::s ц ::t c ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter tse
|
499 |
+
::s Ч ::t C ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter che
|
500 |
+
::s ч ::t c ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter che
|
501 |
+
::s Џ ::t D ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter dzhe
|
502 |
+
::s џ ::t d ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter dzhe
|
503 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ie
|
504 |
+
::s е ::t e ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ie
|
505 |
+
::s Ш ::t S ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital sha
|
506 |
+
::s ш ::t s ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small sha
|
507 |
+
::s Ѓ ::t G ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital gje
|
508 |
+
::s ѓ ::t g ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small gje
|
509 |
+
::s Ж ::t Z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital zhe
|
510 |
+
::s ж ::t z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small zhe
|
511 |
+
::s Ѕ ::t Z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital dze
|
512 |
+
::s ѕ ::t z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small dze
|
513 |
+
::s Ќ ::t K ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital kje
|
514 |
+
::s ќ ::t k ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small kje
|
515 |
+
::s Љ ::t L ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital lje
|
516 |
+
::s љ ::t l ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small lje
|
517 |
+
::s Њ ::t N ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital nje
|
518 |
+
::s њ ::t n ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small nje
|
519 |
+
::s Ž ::t Z ::lcode mkd2 ::comment Latin capital z with caron
|
520 |
+
::s ž ::t z ::lcode mkd2 ::comment Latin small z with caron
|
521 |
+
::s Č ::t C ::lcode mkd2 ::comment Latin capital c with caron
|
522 |
+
::s č ::t c ::lcode mkd2 ::comment Latin small c with caron
|
523 |
+
::s Š ::t S ::lcode mkd2 ::comment Latin capital s with caron
|
524 |
+
::s š ::t s ::lcode mkd2 ::comment Latin small s with caron
|
525 |
+
::s Ǵ ::t G ::lcode mkd2
|
526 |
+
::s ǵ ::t g ::lcode mkd2
|
527 |
+
::s Đ ::t G ::lcode mkd2
|
528 |
+
::s đ ::t g ::lcode mkd2
|
529 |
+
::s Ẑ ::t D ::lcode mkd2
|
530 |
+
::s ẑ ::t d ::lcode mkd2
|
531 |
+
::s J̌ ::t J ::lcode mkd2
|
532 |
+
::s ǰ ::t j ::lcode mkd2
|
533 |
+
::s L̂ ::t L ::lcode mkd2
|
534 |
+
::s l̂ ::t l ::lcode mkd2
|
535 |
+
::s N̂ ::t N ::lcode mkd2
|
536 |
+
::s n̂ ::t n ::lcode mkd2
|
537 |
+
::s Ḱ ::t K ::lcode mkd2
|
538 |
+
::s ḱ ::t k ::lcode mkd2
|
539 |
+
::s Ć ::t K ::lcode mkd2
|
540 |
+
::s ć ::t k ::lcode mkd2
|
541 |
+
::s D̂ ::t D ::lcode mkd2
|
542 |
+
::s d̂ ::t d ::lcode mkd2
|
543 |
+
|
544 |
+
# Kazakh
|
545 |
+
::s Ә ::t A ::lcode kaz
|
546 |
+
::s ә ::t a ::lcode kaz
|
547 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ghe
|
548 |
+
::s г ::t g ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ghe
|
549 |
+
::s Ғ ::t G ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ghe with stroke
|
550 |
+
::s ғ ::t g ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ghe with stroke
|
551 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ie
|
552 |
+
::s е ::t e ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ie
|
553 |
+
::s Ё ::t Yo ::t-alt _NONE_ ::lcode kaz
|
554 |
+
::s ё ::t yo ::t-alt _NONE_ ::lcode kaz
|
555 |
+
::s Х ::t H ::t-alt X ::lcode kaz ::comment Cyrillic capital ha
|
556 |
+
::s х ::t h ::t-alt x ::lcode kaz ::comment Cyrillic small ha
|
557 |
+
::s Һ ::t H ::lcode kaz ::comment Cyrillic capital shha
|
558 |
+
::s һ ::t h ::lcode kaz ::comment Cyrillic small shha
|
559 |
+
::s Қ ::t Q ::t-alt K ::lcode kaz
|
560 |
+
::s қ ::t q ::t-alt k ::lcode kaz
|
561 |
+
::s Ц ::t Ts ::t-alt C ::lcode kaz ::comment Cyrillic capital letter tse
|
562 |
+
::s ц ::t ts ::t-alt c ::lcode kaz ::comment Cyrillic small letter tse
|
563 |
+
::s Щ ::t Sh ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital letter shcha
|
564 |
+
::s щ ::t sh ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small letter shcha
|
565 |
+
::s У ::t U ::t-alt Y ::lcode kaz
|
566 |
+
::s у ::t u ::t-alt y ::lcode kaz
|
567 |
+
::s уы ::t wy ::lcode kaz
|
568 |
+
::s Ж ::t J ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital zhe
|
569 |
+
::s ж ::t j ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small zhe
|
570 |
+
::s Ю ::t Yw ::t-alt Yuw, Yiw ::lcode kaz ::comment Cyrillic capital letter yu
|
571 |
+
::s ю ::t yw ::t-alt yuw, yiw ::lcode kaz ::comment Cyrillic small letter yu
|
572 |
+
|
573 |
+
# Kyrgyz
|
574 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode kir ::comment Cyrillic capital ghe
|
575 |
+
::s г ::t g ::t-alt _NONE_ ::lcode kir ::comment Cyrillic small ghe
|
576 |
+
::s Е ::t E ::t-alt Ye ::lcode kir ::comment Cyrillic capital ie
|
577 |
+
::s е ::t e ::t-alt ye ::lcode kir ::comment Cyrillic small ie
|
578 |
+
::s Ё ::t Yo ::t-alt _NONE_ ::lcode kir
|
579 |
+
::s ё ::t yo ::t-alt _NONE_ ::lcode kir
|
580 |
+
::s Х ::t Kh ::t-alt X, H ::lcode kir ::comment Cyrillic capital ha
|
581 |
+
::s х ::t kh ::t-alt x, h ::lcode kir ::comment Cyrillic small ha
|
582 |
+
::s Ж ::t Zh ::t-alt J ::lcode kir ::comment Cyrillic capital zhe
|
583 |
+
::s ж ::t zh ::t-alt j ::lcode kir ::comment Cyrillic small zhe
|
584 |
+
::s Й ::t Y ::t-alt I ::lcode kir ::comment Cyrillic capital letter short i
|
585 |
+
::s й ::t y ::t-alt i ::lcode kir ::comment Cyrillic small letter short i
|
586 |
+
::s Ц ::t Ts ::t-alt C ::lcode kir ::comment Cyrillic capital letter tse
|
587 |
+
::s ц ::t ts ::t-alt c ::lcode kir ::comment Cyrillic small letter tse
|
588 |
+
::s Ң ::t Ng ::lcode kir
|
589 |
+
::s ң ::t ng ::lcode kir
|
590 |
+
::s Ө ::t O ::t-alt Oe ::lcode kir
|
591 |
+
::s ө ::t o ::t-alt oe ::lcode kir
|
592 |
+
::s Ү ::t U ::t-alt Y, Ue ::lcode kir
|
593 |
+
::s ү ::t u ::t-alt y, ue ::lcode kir
|
594 |
+
::s Ы ::t I ::t-alt Y ::lcode kir
|
595 |
+
::s ы ::t i ::t-alt y ::lcode kir
|
596 |
+
::s йы ::t yi ::lcode kir
|
597 |
+
::s ый ::t iy ::lcode kir
|
598 |
+
|
599 |
+
# Ossetian
|
600 |
+
::s ийы ::t iy ::lcode oss
|
601 |
+
|
602 |
+
# Gothic
|
603 |
+
::s 𐌴 ::t e ::comment Gothic letter aihvus
|
604 |
+
::s 𐌹 ::t i ::comment Gothic letter eis
|
605 |
+
::s 𐍇 ::t x ::comment Gothic letter iggws
|
606 |
+
|
607 |
+
# Runic
|
608 |
+
::s ᛫ ::t " " ::comment Runic single punctuation, used as word separator
|
609 |
+
::s ᛬ ::t . ::comment Runic multiple punctuation, used as sentence separator
|
610 |
+
|
611 |
+
# Ogham
|
612 |
+
::s ᚁ ::t b ::comment Ogham letter Beith
|
613 |
+
::s ᚂ ::t l ::comment Ogham letter Luis
|
614 |
+
::s ᚃ ::t f ::comment Ogham letter Fearn
|
615 |
+
::s ᚄ ::t s ::comment Ogham letter Sail
|
616 |
+
::s ᚅ ::t n ::comment Ogham letter Nion
|
617 |
+
::s ᚋ ::t m ::comment Ogham letter Muin
|
618 |
+
::s ᚌ ::t g ::comment Ogham letter Gort
|
619 |
+
::s ᚍ ::t v ::t-alt ng ::comment Ogham letter nGéadal
|
620 |
+
::s ᚎ ::t z ::comment Ogham letter Straif
|
621 |
+
::s ᚏ ::t r ::comment Ogham letter Ruis
|
622 |
+
::s ᚆ ::t h ::t-alt j ::comment Ogham letter Uath
|
623 |
+
::s ᚇ ::t d ::comment Ogham letter Dair
|
624 |
+
::s ᚈ ::t t ::comment Ogham letter Tinne
|
625 |
+
::s ᚉ ::t k ::comment Ogham letter Coll
|
626 |
+
::s ᚊ ::t q ::t-alt kw ::comment Ogham letter Ceirt
|
627 |
+
::s ᚐ ::t a ::comment Ogham letter Ailm
|
628 |
+
::s ᚑ ::t o ::comment Ogham letter Onn
|
629 |
+
::s ᚒ ::t u ::comment Ogham letter Úr
|
630 |
+
::s ᚓ ::t e ::comment Ogham letter Eadhadh
|
631 |
+
::s ᚔ ::t i ::comment Ogham letter Iodhadh
|
632 |
+
::s ᚚ ::t p ::comment Ogham letter Peith
|
633 |
+
# Additional Ogham letters (outside standard alphabet)
|
634 |
+
::s ᚕ ::t eo ::t-alt ea ::comment Ogham additional letter Éabhadh
|
635 |
+
::s ᚖ ::t oi ::t-alt oe ::comment Ogham additional letter Ór
|
636 |
+
::s ᚗ ::t ui ::t-alt ua ::comment Ogham additional letter Uilleann
|
637 |
+
::s ᚘ ::t p ::t-alt io ::comment Ogham additional letter Ifín
|
638 |
+
::s ᚙ ::t ch ::t-alt x, ai ::comment Ogham additional letter Eamhancholl
|
639 |
+
::s   ::t " " ::comment Ogham space mark
|
640 |
+
::s ᚛ ::t "" ::comment Ogham feather mark
|
641 |
+
::s ᚜ ::t "" ::comment Ogham feather mark
|
642 |
+
|
643 |
+
# Georgian
|
644 |
+
::s ა ::t a ::comment Georgian letter an
|
645 |
+
::s ე ::t e ::comment Georgian letter en
|
646 |
+
::s ი ::t i ::comment Georgian letter in
|
647 |
+
::s ო ::t o ::comment Georgian letter on
|
648 |
+
::s უ ::t u ::comment Georgian letter un
|
649 |
+
::s ჱ ::t ey ::comment archaic Georgian letter he
|
650 |
+
::s ჲ ::t i ::comment archaic Georgian letter hie
|
651 |
+
::s ჳ ::t w ::comment archaic Georgian letter we
|
652 |
+
::s ჴ ::t q ::comment archaic Georgian letter har
|
653 |
+
::s ჵ ::t o ::comment archaic Georgian letter hoe
|
654 |
+
::s ჶ ::t f ::comment Georgian letter fi (Greek phi)
|
655 |
+
::s ჷ ::t e ::comment Georgian letter yn (schwa)
|
656 |
+
::s ჸ ::t a ::comment Georgian letter elifi
|
657 |
+
::s ჹ ::t g ::comment Georgian letter gan
|
658 |
+
::s ჺ ::t ' ::comment Georgian letter ain
|
659 |
+
::s ჼ ::t n ::comment Georgian letter nar
|
660 |
+
::s ჽ ::t e ::comment Georgian letter aen
|
661 |
+
::s ჾ ::t ::comment Georgian letter hard sign
|
662 |
+
::s ჿ ::t w ::comment Georgian letter labial sign
|
663 |
+
|
664 |
+
::s Ⴚ ::t TS ::comment GEORGIAN CAPITAL LETTER CAN
|
665 |
+
::s ც ::t ts ::comment GEORGIAN LETTER CAN
|
666 |
+
::s Ც ::t TS ::comment GEORGIAN MTAVRULI CAPITAL LETTER CAN
|
667 |
+
::s ⴚ ::t ts ::comment GEORGIAN SMALL LETTER CAN
|
668 |
+
::s Ⴜ ::t TS ::comment GEORGIAN CAPITAL LETTER CIL
|
669 |
+
::s წ ::t ts ::comment GEORGIAN LETTER CIL
|
670 |
+
::s Წ ::t TS ::comment GEORGIAN MTAVRULI CAPITAL LETTER CIL
|
671 |
+
::s ⴜ ::t ts ::comment GEORGIAN SMALL LETTER CIL
|
672 |
+
::s Ⴛ ::t DZ ::comment GEORGIAN CAPITAL LETTER JIL
|
673 |
+
::s ძ ::t dz ::comment GEORGIAN LETTER JIL
|
674 |
+
::s Ძ ::t DZ ::comment GEORGIAN MTAVRULI CAPITAL LETTER JIL
|
675 |
+
::s ⴛ ::t dz ::comment GEORGIAN SMALL LETTER JIL
|
676 |
+
::s Ⴟ ::t J ::comment GEORGIAN CAPITAL LETTER JHAN
|
677 |
+
::s ჯ ::t j ::comment GEORGIAN LETTER JHAN
|
678 |
+
::s Ჯ ::t J ::comment GEORGIAN MTAVRULI CAPITAL LETTER JHAN
|
679 |
+
::s ⴟ ::t j ::comment GEORGIAN SMALL LETTER JHAN
|
680 |
+
|
681 |
+
|
682 |
+
::s Ⴀ ::t A ::comment Georgian capital letter an
|
683 |
+
::s Ⴄ ::t E ::comment Georgian capital letter en
|
684 |
+
::s Ⴈ ::t I ::comment Georgian capital letter in
|
685 |
+
::s Ⴍ ::t O ::comment Georgian capital letter on
|
686 |
+
::s Ⴓ ::t U ::comment Georgian capital letter un
|
687 |
+
::s Ⴡ ::t EY ::comment archaic Georgian capital letter he
|
688 |
+
::s Ⴢ ::t I ::comment archaic Georgian capital letter hie
|
689 |
+
::s Ⴣ ::t W ::comment archaic Georgian capital letter we
|
690 |
+
::s Ⴤ ::t Q ::comment archaic Georgian capital letter har
|
691 |
+
::s Ⴥ ::t O ::comment archaic Georgian capital letter hoe
|
692 |
+
::s Ⴧ ::t E ::comment archaic Georgian capital letter yn (schwa)
|
693 |
+
::s Ⴭ ::t E ::comment archaic Georgian capital letter aen
|
694 |
+
|
695 |
+
::s Ა ::t A ::comment Georgian Mtavruli capital letter an
|
696 |
+
::s Ე ::t E ::comment Georgian Mtavruli capital letter en
|
697 |
+
::s Ი ::t I ::comment Georgian Mtavruli capital letter in
|
698 |
+
::s Ო ::t O ::comment Georgian Mtavruli capital letter on
|
699 |
+
::s Უ ::t U ::comment Georgian Mtavruli capital letter un
|
700 |
+
::s Ჱ ::t EY ::comment archaic Georgian Mtavruli capital letter he
|
701 |
+
::s Ჲ ::t I ::comment archaic Georgian Mtavruli capital letter hie
|
702 |
+
::s Ჳ ::t W ::comment archaic Georgian Mtavruli capital letter we
|
703 |
+
::s Ჴ ::t Q ::comment archaic Georgian Mtavruli capital letter har
|
704 |
+
::s Ჵ ::t O ::comment archaic Georgian Mtavruli capital letter hoe
|
705 |
+
::s Ჶ ::t F ::comment Georgian Mtavruli capital letter fi (Greek phi)
|
706 |
+
::s Ჷ ::t E ::comment Georgian Mtavruli capital letter yn (schwa)
|
707 |
+
::s Ჸ ::t A ::comment Georgian Mtavruli capital letter elifi
|
708 |
+
::s Ჹ ::t G ::comment Georgian Mtavruli capital letter gan
|
709 |
+
::s Ჺ ::t ' ::comment Georgian Mtavruli capital letter ain
|
710 |
+
::s Ჽ ::t E ::comment Georgian Mtavruli capital letter aen
|
711 |
+
::s Ჾ ::t ::comment Georgian Mtavruli capital letter hard sign
|
712 |
+
::s Ჿ ::t W ::comment Georgian Mtavruli capital letter labial sign
|
713 |
+
|
714 |
+
::s ⴀ ::t a ::comment Georgian small letter an
|
715 |
+
::s ⴄ ::t e ::comment Georgian small letter en
|
716 |
+
::s ⴈ ::t i ::comment Georgian small letter in
|
717 |
+
::s ⴍ ::t o ::comment Georgian small letter on
|
718 |
+
::s ⴓ ::t u ::comment Georgian small letter un
|
719 |
+
::s ⴡ ::t ey ::comment archaic Georgian small letter he
|
720 |
+
::s ⴢ ::t i ::comment archaic Georgian small letter hie
|
721 |
+
::s ⴣ ::t w ::comment archaic Georgian small letter we
|
722 |
+
::s ⴤ ::t q ::comment archaic Georgian small letter har
|
723 |
+
::s ⴥ ::t o ::comment archaic Georgian small letter hoe
|
724 |
+
::s ⴧ ::t e ::comment Georgian small letter yn (schwa)
|
725 |
+
::s ⴭ ::t e ::comment Georgian small letter aen
|
726 |
+
|
727 |
+
# Armenian
|
728 |
+
::s Ա ::t A ::comment Armenian capital letter ayb
|
729 |
+
::s ա ::t a ::comment Armenian small letter ayb
|
730 |
+
::s ՠ ::t a ::comment ARMENIAN SMALL LETTER TURNED AYB (CHECK)
|
731 |
+
::s Ե ::t E ::comment Armenian capital letter ech ::dont-use-at-start-of-word
|
732 |
+
::s ե ::t e ::comment Armenian small letter ech ::dont-use-at-start-of-word
|
733 |
+
::s Ե ::t Ye ::comment Armenian capital letter ech ::use-only-at-start-of-word
|
734 |
+
::s ե ::t ye ::comment Armenian small letter ech ::use-only-at-start-of-word
|
735 |
+
::s Է ::t E ::comment Armenian capital letter eh
|
736 |
+
::s է ::t e ::comment Armenian small letter eh
|
737 |
+
::s Ը ::t E ::comment Armenian capital letter et
|
738 |
+
::s ը ::t e ::comment Armenian small letter et
|
739 |
+
::s Ի ::t I ::comment Armenian capital letter ini
|
740 |
+
::s ի ::t i ::comment Armenian small letter ini
|
741 |
+
::s Յ ::t Y ::comment Armenian capital letter yi
|
742 |
+
::s յ ::t y ::comment Armenian small letter yi
|
743 |
+
::s ֈ ::t y ::comment ARMENIAN SMALL LETTER YI WITH STROKE (CHECK)
|
744 |
+
::s Ո ::t Vo ::comment Armenian capital letter vo ::use-only-at-start-of-word
|
745 |
+
::s ո ::t vo ::comment Armenian small letter vo ::use-only-at-start-of-word
|
746 |
+
::s Ո ::t O ::comment Armenian capital letter vo ::dont-use-at-start-of-word
|
747 |
+
::s ո ::t o ::comment Armenian small letter vo ::dont-use-at-start-of-word
|
748 |
+
::s Ւ ::t W ::comment Armenian capital letter yiwn
|
749 |
+
::s ւ ::t w ::comment Armenian small letter yiwn
|
750 |
+
::s Օ ::t O ::comment Armenian capital letter oh
|
751 |
+
::s օ ::t o ::comment Armenian small letter oh
|
752 |
+
::s Խ ::t Kh ::comment Armenian capital letter xeh
|
753 |
+
::s խ ::t kh ::comment Armenian small letter xeh
|
754 |
+
|
755 |
+
::s Ժ ::t Zh ::comment Armenian capital letter zhe
|
756 |
+
::s Ղ ::t Gh ::comment Armenian capital letter ghad
|
757 |
+
::s Ճ ::t Tch ::comment Armenian capital letter cheh
|
758 |
+
::s ճ ::t tch ::comment Armenian small letter cheh
|
759 |
+
::s Շ ::t Sh ::comment Armenian capital letter sha
|
760 |
+
::s Չ ::t Ch ::comment Armenian capital letter cha
|
761 |
+
::s Ջ ::t J ::comment Armenian capital letter jheh
|
762 |
+
::s ջ ::t j ::comment Armenian small letter jheh
|
763 |
+
::s Վ ::t V ::comment Armenian capital letter vew
|
764 |
+
::s վ ::t v ::comment Armenian small letter vew
|
765 |
+
::s Ձ ::t Dz ::comment Armenian capital letter ja
|
766 |
+
::s ձ ::t dz ::comment Armenian small letter ja
|
767 |
+
::s Ծ ::t Ts ::comment Armenian capital letter ca
|
768 |
+
::s ծ ::t ts ::comment Armenian small letter ca
|
769 |
+
::s Ք ::t K ::t-alt Q ::comment Armenian capital letter keh - sometimes romanized as K' or Q
|
770 |
+
::s ք ::t k ::t-alt q ::comment Armenian small letter keh - sometimes romanized as k' or q
|
771 |
+
|
772 |
+
::s են ::t en ::use-only-for-whole-word ::comment exception (auxiliary verb)
|
773 |
+
::s եմ ::t em ::use-only-for-whole-word ::comment exception (auxiliary verb)
|
774 |
+
::s ենք ::t enk ::use-only-for-whole-word ::comment exception (auxiliary verb)
|
775 |
+
::s ես ::t es ::use-only-for-whole-word ::comment exception (auxiliary verb)
|
776 |
+
::s եք ::t ek ::use-only-for-whole-word ::comment exception (auxiliary verb)
|
777 |
+
|
778 |
+
::s և ::t ev ::comment Armenian small ligature ech yiwn
|
779 |
+
::s ՈՒ ::t U ::comment Armenian capital vo+yiwn
|
780 |
+
::s Ու ::t U ::comment Armenian capital/small vo+yiwn
|
781 |
+
::s ու ::t u ::comment Armenian small vo+yiwn
|
782 |
+
|
783 |
+
::s իւ ::t yu
|
784 |
+
|
785 |
+
## Japanese
|
786 |
+
# Katakana
|
787 |
+
::s シ ::t shi
|
788 |
+
::s チ ::t chi
|
789 |
+
::s フ ::t fu
|
790 |
+
::s ジ ::t ji
|
791 |
+
::s ヂ ::t ji
|
792 |
+
::s ヅ ::t zu
|
793 |
+
::s シャ ::t sha
|
794 |
+
::s シュ ::t shu
|
795 |
+
::s ショ ::t sho
|
796 |
+
::s チャ ::t cha
|
797 |
+
::s チェ ::t che
|
798 |
+
::s チュ ::t chu
|
799 |
+
::s チョ ::t cho
|
800 |
+
::s ジャ ::t ja
|
801 |
+
::s ジュ ::t ju
|
802 |
+
::s ジョ ::t jo
|
803 |
+
::s ジェ ::t je
|
804 |
+
::s ヂャ ::t ja
|
805 |
+
::s ヂュ ::t ju
|
806 |
+
::s ヂョ ::t jo
|
807 |
+
::s フェ ::t fe
|
808 |
+
::s ヴェ ::t ve
|
809 |
+
::s フィ ::t fi
|
810 |
+
::s ウィ ::t wi
|
811 |
+
::s ヴィ ::t vi
|
812 |
+
::s ティ ::t ti
|
813 |
+
::s ディ ::t di
|
814 |
+
::s ッ ::t (__SOKUON__) ::comment katakana double following consonant
|
815 |
+
::s ー ::t (__CHOONPU__) ::comment katakana prolonged sound mark
|
816 |
+
::s 𛅤 ::t i ::comment KATAKANA LETTER SMALL WI
|
817 |
+
::s 𛅥 ::t e ::comment KATAKANA LETTER SMALL WE
|
818 |
+
::s 𛅦 ::t o ::comment KATAKANA LETTER SMALL WO
|
819 |
+
# Hiragana
|
820 |
+
::s し ::t shi
|
821 |
+
::s ち ::t chi
|
822 |
+
::s つ ::t tsu
|
823 |
+
::s ふ ::t fu
|
824 |
+
::s を ::t o
|
825 |
+
::s じ ::t ji
|
826 |
+
::s ぢ ::t ji
|
827 |
+
::s づ ::t zu
|
828 |
+
::s しゃ ::t sha
|
829 |
+
::s しゅ ::t shu
|
830 |
+
::s しょ ::t sho
|
831 |
+
::s ちゃ ::t cha
|
832 |
+
::s ちゅ ::t chu
|
833 |
+
::s ちょ ::t cho
|
834 |
+
::s じゃ ::t ja
|
835 |
+
::s じゅ ::t ju
|
836 |
+
::s じょ ::t jo
|
837 |
+
::s ぢゃ ::t ja
|
838 |
+
::s ぢゅ ::t ju
|
839 |
+
::s ぢょ ::t jo
|
840 |
+
::s 𛅐 ::t i ::comment HIRAGANA LETTER SMALL WI
|
841 |
+
::s 𛅑 ::t e ::comment HIRAGANA LETTER SMALL WE
|
842 |
+
::s 𛅒 ::t o ::comment HIRAGANA LETTER SMALL WO
|
843 |
+
::s っ ::t (__SOKUON__) ::comment hiragana double following consonant
|
844 |
+
::s 々 ::t ² ::comment ideographic iteration mark ::annotation repetition-sign
|
845 |
+
|
846 |
+
::s フ ::t fu ::t-alt f
|
847 |
+
::s キ ::t ki ::t-alt k
|
848 |
+
::s ク ::t ku ::t-alt k
|
849 |
+
::s ラ ::t ra ::t-alt la
|
850 |
+
::s リ ::t ri ::t-alt li
|
851 |
+
::s ル ::t ru ::t-alt lu, l, r
|
852 |
+
::s レ ::t re ::t-alt le
|
853 |
+
::s ロ ::t ro ::t-alt lo
|
854 |
+
::s ム ::t mu ::t-alt m ::example キム = Kim
|
855 |
+
::s シ ::t shi ::t-alt si ::example メキシコ = meksiko (Mexico)
|
856 |
+
::s ス ::t su ::t-alt s
|
857 |
+
::s ト ::t to ::t-alt t
|
858 |
+
::s ツ ::t tsu ::t-alt tu, ts ::example シュルツ = Schultz
|
859 |
+
|
860 |
+
::s ㋿ ::t Reiwa ::comment SQUARE ERA NAME REIWA
|
861 |
+
|
862 |
+
# Chinese
|
863 |
+
::s 邦 ::t bang ::t-alt bon, bum, bun, pon
|
864 |
+
::s 鲍 ::t bao ::t-alt bow
|
865 |
+
::s 堡 ::t bao ::t-alt berg, burg, bourg, burgh
|
866 |
+
::s 贝 ::t bei ::t-alt ber
|
867 |
+
::s 本 ::t ben ::t-alt bern, bon, bourn, burn
|
868 |
+
::s 彼得 ::t bide ::t-alt peter, pet
|
869 |
+
::s 伯 ::t bo ::t-alt ber
|
870 |
+
::s 波 ::t bo ::t-alt po
|
871 |
+
::s 布 ::t bu ::t-alt b
|
872 |
+
::s 策 ::t ce ::t-alt tze, tzer
|
873 |
+
::s 曾 ::t ceng ::t-alt tzen, zen
|
874 |
+
::s 彻 ::t che ::t-alt tche
|
875 |
+
::s 茨 ::t ci ::t-alt ts, tz, z
|
876 |
+
::s 兹 ::t ci ::t-alt ds, dz, tz, z, zi
|
877 |
+
::s 蒂 ::t di ::t-alt ti, tti
|
878 |
+
::s 丁 ::t ding ::t-alt din, tin
|
879 |
+
::s 顿 ::t dun ::t-alt ton
|
880 |
+
::s 多 ::t duo ::t-alt do, dor, to
|
881 |
+
::s 尔 ::t er ::t-alt l, le, ll, r
|
882 |
+
::s 弗 ::t fu ::t-alt f, fer, pher, v, ver, vir
|
883 |
+
::s 夫 ::t fu ::t-alt f, v
|
884 |
+
::s 福 ::t fu ::t-alt faw, for, ford
|
885 |
+
::s 哥 ::t ge ::t-alt go, co
|
886 |
+
::s 戈 ::t ge ::t-alt go
|
887 |
+
::s 各 ::t ge ::t-alt go, co
|
888 |
+
::s 赫 ::t he ::t-alt ch, che, cher, ge
|
889 |
+
::s 华 ::t hua ::t-alt ver, wa, war, wer ::example Washington
|
890 |
+
::s 怀 ::t huai ::t-alt whi, wi, wy
|
891 |
+
::s 惠 ::t hui ::t-alt wha, whea
|
892 |
+
::s 基 ::t ji ::t-alt ki, chi
|
893 |
+
::s 吉 ::t ji ::t-alt gi, gui
|
894 |
+
::s 加 ::t jia ::t-alt ca, ga, ka ::example Canada
|
895 |
+
::s 杰 ::t jie ::t-alt ger
|
896 |
+
::s 金 ::t jin ::t-alt kin, gin
|
897 |
+
::s 斤 ::t jin ::t-alt zin
|
898 |
+
::s 康 ::t kang ::t-alt con, corn
|
899 |
+
::s 考 ::t kao ::t-alt cow, cour
|
900 |
+
::s 克 ::t ke ::t-alt k, che, cher
|
901 |
+
::s 科 ::t ke ::t-alt ko
|
902 |
+
::s 拉 ::t la ::t-alt ra ::example Tirana
|
903 |
+
::s 朗 ::t lang ::t-alt lon, ron
|
904 |
+
::s 赖 ::t lai ::t-alt ri
|
905 |
+
::s 劳 ::t lao ::t-alt low
|
906 |
+
::s 勒 ::t lei ::t-alt ler
|
907 |
+
::s 伦 ::t lun ::t-alt lon, ran, ron
|
908 |
+
::s 里 ::t li ::t-alt ri
|
909 |
+
::s 利 ::t li ::t-alt ri ::example Ferrari
|
910 |
+
::s 隆 ::t long ::t-alt lon, lum, lund
|
911 |
+
::s 罗 ::t luo ::t-alt l, lo, lu, ro, row, ru
|
912 |
+
::s 洛 ::t luo ::t-alt lo, low, ro
|
913 |
+
::s 默 ::t mo ::t-alt mer
|
914 |
+
::s 纳 ::t na ::t-alt ne, ner
|
915 |
+
::s 珀 ::t po ::t-alt per
|
916 |
+
::s 奇 ::t qi ::t-alt chi, dge, ge, tch
|
917 |
+
::s 齐 ::t qi ::t-alt tsi, zi
|
918 |
+
::s 乔 ::t qiao ::t-alt jo
|
919 |
+
::s 青 ::t qing ::t-alt tsing
|
920 |
+
::s 琼 ::t qiong ::t-alt jon, jum, jun
|
921 |
+
::s 瑟 ::t se ::t-alt the
|
922 |
+
::s 什 ::t shen ::t-alt sh
|
923 |
+
::s 圣 ::t sheng ::t-alt san, sao, saint
|
924 |
+
::s 斯 ::t si ::t-alt s, rth, th ::example Alaska
|
925 |
+
::s 索 ::t suo ::t-alt tho
|
926 |
+
::s 特 ::t te ::t-alt t
|
927 |
+
::s 翁 ::t weng ::t-alt on
|
928 |
+
::s 沃 ::t wo ::t-alt ver, vo, war, wer
|
929 |
+
::s 乌 ::t wu ::t-alt ou, u
|
930 |
+
::s 希 ::t xi ::t-alt chi, hi, shi
|
931 |
+
::s 西 ::t xi ::t-alt s, si
|
932 |
+
::s 锡 ::t xi ::t-alt ci, si, thi, zi
|
933 |
+
::s 夏 ::t xia ::t-alt ha, cha, cia, sha, tia
|
934 |
+
::s 香 ::t xiang ::t-alt chan, cham
|
935 |
+
::s 歇 ::t xie ::t-alt she
|
936 |
+
::s 谢 ::t xie ::t-alt che, she
|
937 |
+
::s 辛 ::t xin ::t-alt cin, sen, sin, sing, sun, zen
|
938 |
+
::s 欣 ::t xin ::t-alt hin, shin
|
939 |
+
::s 休 ::t xiu ::t-alt hu, hue
|
940 |
+
::s 修 ::t xiu ::t-alt ciu, siu, thew, tiu
|
941 |
+
::s 许 ::t xu ::t-alt hue, schue
|
942 |
+
::s 逊 ::t xun ::t-alt son
|
943 |
+
::s 耶 ::t ye ::t-alt yer, ier
|
944 |
+
::s 泽 ::t ze ::t-alt ser
|
945 |
+
::s 扎 ::t zha ::t-alt za
|
946 |
+
::s 詹 ::t zhan ::t-alt ja, jam, jan, jen, jon
|
947 |
+
::s 治 ::t zhi ::t-alt ge ::example George
|
948 |
+
|
949 |
+
## Numbers
|
950 |
+
# Chinese and Japanese numbers
|
951 |
+
::s 零 ::num 0
|
952 |
+
::s 〇 ::num 0
|
953 |
+
::s 一 ::num 1
|
954 |
+
::s 二 ::num 2
|
955 |
+
::s 三 ::num 3
|
956 |
+
::s 四 ::num 4
|
957 |
+
::s 五 ::num 5
|
958 |
+
::s 六 ::num 6
|
959 |
+
::s 七 ::num 7
|
960 |
+
::s 八 ::num 8
|
961 |
+
::s 九 ::num 9
|
962 |
+
::s 十 ::num 10
|
963 |
+
::s 百 ::num 100
|
964 |
+
::s 千 ::num 1000
|
965 |
+
::s 万 ::num 10000
|
966 |
+
::s 萬 ::num 10000
|
967 |
+
::s 亿 ::num 100000000
|
968 |
+
::s 億 ::num 100000000
|
969 |
+
::s 兆 ::num 1000000000000
|
970 |
+
::s 京 ::num 10000000000000000
|
971 |
+
|
972 |
+
# numbers in non-number words (to be extended)
|
973 |
+
::s 一贯 ::t yiguan ::comment consistent
|
974 |
+
|
975 |
+
::s 红十字会 ::t hongshizihui ::comment Red Cross
|
976 |
+
|
977 |
+
::s 百度 ::t baidu ::comment Baidu (company)
|
978 |
+
::s 百分 ::t baifen ::comment percent
|
979 |
+
::s 百合 ::t baihe ::comment lily
|
980 |
+
::s 百货 ::t baihuo ::comment general merchandise
|
981 |
+
::s 百科 ::t baike ::comment encyclopedia
|
982 |
+
::s 百老汇 ::t bailaohui
|
983 |
+
::s 百灵 ::t bailing
|
984 |
+
::s 百慕大 ::t baimuda
|
985 |
+
::s 百日咳 ::t bairike
|
986 |
+
::s 百色市 ::t baiseshi
|
987 |
+
::s 百事可乐 ::t baishikele ::comment Pepsi Cola
|
988 |
+
::s 百無 ::t baiwu
|
989 |
+
::s 百香 ::t baixiang
|
990 |
+
::s 百姓 ::t baixing
|
991 |
+
::s 百叶 ::t baiye
|
992 |
+
::s 百色 ::t bose
|
993 |
+
::s 杨百翰 ::t yangbaihan ::comment Brigham Young
|
994 |
+
|
995 |
+
::s 北京 ::t beijing
|
996 |
+
::s 京都 ::t jingdu
|
997 |
+
::s 东京 ::t dongjing
|
998 |
+
::s 京胡 ::t jinghu
|
999 |
+
::s 南京 ::t nanjing
|
1000 |
+
::s 普京 ::t pujing ::comment Putin
|
1001 |
+
::s 東京 ::t dongjing ::comment Tokyo
|
1002 |
+
::s 京兆 ::t jingzhao
|
1003 |
+
|
1004 |
+
::s ㎢ ::t km²
|
1005 |
+
::s ㎥ ::t m³
|
1006 |
+
::s ㎝ ::t cm
|
1007 |
+
|
1008 |
+
## Indian
|
1009 |
+
# see mostly under UnicodeDataOverwrite.txt
|
1010 |
+
|
1011 |
+
# Malayalam
|
1012 |
+
::s ൗ ::t au ::comment MALAYALAM AU LENGTH MARK
|
1013 |
+
|
1014 |
+
# Tamil
|
1015 |
+
::s ட ::t d ::comment most commonly d, but t when word-initial or in a doubled consonant
|
1016 |
+
::s ஃப ::t f ::comment h+p=f
|
1017 |
+
::s ஃஜ ::t z ::comment h+j=z
|
1018 |
+
|
1019 |
+
# Myanmar/Burmese
|
1020 |
+
# ::s ့ ::t ::comment dot below, denotes creaky tone
|
1021 |
+
# ::s း ::t ::comment visarga, denotes high tone
|
1022 |
+
::s ၌ ::t -nai ::comment locative
|
1023 |
+
::s ၍ ::t -jwe ::comment completed
|
1024 |
+
::s ၎ ::t legau ::comment aforementioned
|
1025 |
+
::s ၏ ::t -i ::comment genitive
|
1026 |
+
|
1027 |
+
# Lao
|
1028 |
+
::s ັ ::t a ::comment vowel sign mai kan
|
1029 |
+
::s ົ ::t o ::comment vowel sign mai kon
|
1030 |
+
::s ູ ::t uu ::comment vowel sign uu
|
1031 |
+
::s ຽ ::t y ::comment semivowel sign nyo
|
1032 |
+
::s ຼ ::t l ::comment semivowel sign lo
|
1033 |
+
::s ລ ::t l ::comment lo loot
|
1034 |
+
::s ຣ ::t l ::comment lo ling
|
1035 |
+
::s ໝ ::t m ::comment ho mo
|
1036 |
+
::s ໜ ::t n ::comment ho no
|
1037 |
+
::s ຢ ::t y ::comment yo
|
1038 |
+
::s ໍ ::t oo ::comment niggahita (possibly also nasal -m in final position)
|
1039 |
+
::s ໆ ::t ² ::comment Lao ko la ::annotation repetition-sign
|
1040 |
+
::s ຯ ::t ... ::comment Lao ellipsis
|
1041 |
+
|
1042 |
+
# Thai
|
1043 |
+
::s ออ ::t o
|
1044 |
+
::s อั ::t a
|
1045 |
+
::s อิ ::t i
|
1046 |
+
::s ๆ ::t ² ::comment Thai character maiyamok ::annotation repetition-sign
|
1047 |
+
|
1048 |
+
# Khmer
|
1049 |
+
::s ័ ::t "" ::comment Khmer samyok sannya: indicates deviation from the general rules of pronunciation
|
1050 |
+
::s ៏ ::t "" ::comment Khmer sign ahsda: denotes stressed intonation in some single-consonant words
|
1051 |
+
::s ៍ ::t "" ::comment Khmer sign toandakhiat: indicates that the base character is not pronounced
|
1052 |
+
::s ៌ ::t "" ::comment Khmer sign robat: a diacritic historically corresponding to the repha form of ra in Devanagari
|
1053 |
+
::s ប៉ ::t pa ::comment Khmer ba + musĕkâtônd -> pa
|
1054 |
+
::s ៗ ::t ² ::comment Khmer sign lek too ::annotation repetition-sign
|
1055 |
+
|
1056 |
+
## Semitic languages
|
1057 |
+
# Arabic
|
1058 |
+
::s و ::t w ::comment Arabic letter waw ::t-alt o, u ::lcode ara
|
1059 |
+
::s ء ::t ' ::comment hamza
|
1060 |
+
::s ٔ ::t ' ::comment hamza above
|
1061 |
+
::s ٕ ::t ' ::comment hamza below
|
1062 |
+
::s ع ::t ' ::comment ain
|
1063 |
+
::s آ ::t a ::comment alef madda
|
1064 |
+
::s ٓا ::t a ::comment Arabic maddah above plus alef (presumably an ill-formed version of آ; found 1 instance in Urdu text)
|
1065 |
+
::s إ ::t i ::comment alef with hamza below
|
1066 |
+
::s ٱ ::t a ::comment alef wasla ::comment typically indicates liaison with preceding word
|
1067 |
+
::s ة ::t a ::comment teh marbuta
|
1068 |
+
::s ۃ ::t a ::comment teh marbuta goal ::comment Used in Punjabi, Sindhi. Different from plain 'teh marbuta'?
|
1069 |
+
::s ي ::t y ::comment Arabic yeh
|
1070 |
+
::s ى ::t a ::comment alef maksura
|
1071 |
+
::s ﻯ ::t a ::comment alef maksura isolated form
|
1072 |
+
::s ﻰ ::t a ::comment alef maksura final form
|
1073 |
+
::s ﯨ ::t a ::comment Uighur Kazakh Kirghiz alef maksura initial form
|
1074 |
+
::s ﯩ ::t a ::comment Uighur Kazakh Kirghiz alef maksura medial form
|
1075 |
+
::s ٰ ::t a ::comment Arabic letter superscript alef
|
1076 |
+
::s ـ ::t ::comment tatweel (filler)
|
1077 |
+
::s َ ::t a ::comment fatha ("-a")
|
1078 |
+
::s ُ ::t u ::comment damma ("-u")
|
1079 |
+
::s ِ ::t i ::comment kasra ("-i")
|
1080 |
+
::s ْ ::t ::comment sukun (no vowel)
|
1081 |
+
::s ۡ ::t ::comment small high dotless head of khah; like sukun (no vowel); used in Kashmiri, Assamese
|
1082 |
+
::s ً ::t ::comment fathatan ("-an")
|
1083 |
+
::s اً ::t an ::comment alef + fathatan
|
1084 |
+
::s ٌ ::t ::comment dammatan ("-un")
|
1085 |
+
::s ٍ ::t ::comment kasratan ("-in")
|
1086 |
+
::s ّ ::t ::comment shadda (consonant doubler)
|
1087 |
+
::s ڃ ::t ny ::comment Arabic letter nyeh U+0683 (used in Sindhi (snd))
|
1088 |
+
::s ڄ ::t dy ::comment Arabic letter dyeh U+0684 (used in Sindhi (snd))
|
1089 |
+
::s ۾ ::t men ::comment Sindhi postposition men
|
1090 |
+
::s ؑ ::t alayhe wasallam ::comment "upon him be peace"
|
1091 |
+
::s ﷴ ::t mohammad ::comment "Mohammad"
|
1092 |
+
::s ﷸ ::t wasallam ::comment "and peace"
|
1093 |
+
::s ﷺ ::t sallallahou alayhe wasallam ::comment "prayer of God be upon him and his family and peace"
|
1094 |
+
|
1095 |
+
::s ࣓ ::t waw ::comment ARABIC SMALL LOW WAW
|
1096 |
+
::s ࣔ ::t al-rub ::comment ARABIC SMALL HIGH WORD AR-RUB
|
1097 |
+
::s ࣕ ::t s ::comment ARABIC SMALL HIGH SAD
|
1098 |
+
::s ࣖ ::t ' ::comment ARABIC SMALL HIGH AIN
|
1099 |
+
::s ࣗ ::t q ::comment ARABIC SMALL HIGH QAF
|
1100 |
+
::s ࣘ ::t n ::comment ARABIC SMALL HIGH NOON WITH KASRA
|
1101 |
+
::s ࣙ ::t n ::comment ARABIC SMALL LOW NOON WITH KASRA
|
1102 |
+
::s ࣚ ::t al-thalatha ::comment ARABIC SMALL HIGH WORD ATH-THALATHA
|
1103 |
+
::s ࣛ ::t al-sajda ::comment ARABIC SMALL HIGH WORD AS-SAJDA
|
1104 |
+
::s ࣜ ::t al-nisf ::comment ARABIC SMALL HIGH WORD AN-NISF
|
1105 |
+
::s ࣝ ::t sakta ::comment ARABIC SMALL HIGH WORD SAKTA
|
1106 |
+
::s ࣞ ::t qif ::comment ARABIC SMALL HIGH WORD QIF
|
1107 |
+
::s ࣟ ::t waqfa ::comment ARABIC SMALL HIGH WORD WAQFA
|
1108 |
+
::s ࣠ ::t ::comment ARABIC SMALL HIGH FOOTNOTE MARKER (CHECK)
|
1109 |
+
::s ࣡ ::t ::comment ARABIC SMALL HIGH SIGN SAFHA (CHECK)
|
1110 |
+
::s ::t ::comment ARABIC DISPUTED END OF AYAH (CHECK)
|
1111 |
+
|
1112 |
+
# Farsi
|
1113 |
+
::s ی ::t i ::t-alt y ::comment Contributed by Nima
|
1114 |
+
::s ای ::t i ::t-alt ai ::use-only-at-start-of-word ::comment Contributed by Nima
|
1115 |
+
::s هٔ ::t eye ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
|
1116 |
+
::s و ::t v ::t-alt o, u ::lcode fas ::comment Arabic letter waw
|
1117 |
+
::s ض ::t z ::t-alt d ::lcode fas ::comment Contributed by Marjan
|
1118 |
+
::s ث ::t s ::t-alt th ::lcode fas ::comment Contributed by Marjan
|
1119 |
+
::s ذ ::t z ::t-alt th ::lcode fas ::comment Contributed by Nima
|
1120 |
+
::s ع ::t a ::t-alt ' ::lcode fas ::comment Contributed by Nima
|
1121 |
+
::s عا ::t a ::lcode fas ::comment Contributed by Nima
|
1122 |
+
::s عی ::t i ::t-alt iy ::lcode fas ::comment Contributed by Nima
|
1123 |
+
::s عو ::t u ::t-alt o, av ::lcode fas ::comment Contributed by Nima
|
1124 |
+
::s چ ::t ch ::t-alt tch, tsh ::lcode fas ::comment Contributed by Nima
|
1125 |
+
::s ه ::t e ::t-alt h ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
|
1126 |
+
::s ::t "" ::t-alt " " ::lcode fas ::comment source is character "zero-width non-joiner" (U+200C); Contributed by Nima
|
1127 |
+
::s غ ::t gh ::t-alt g ::lcode fas
|
1128 |
+
::s آئی ::t ai ::t-alt ae ::lcode fas
|
1129 |
+
::s ائی ::t ai ::t-alt ae ::lcode fas
|
1130 |
+
::s آئو ::t au ::t-alt ao ::lcode fas
|
1131 |
+
::s ائو ::t au ::t-alt ao ::lcode fas
|
1132 |
+
|
1133 |
+
# Kashmiri (so far: educated guesses)
|
1134 |
+
::s ٖ ::t a ::comment Arabic subscript alef U+0656
|
1135 |
+
::s ٗ ::t u ::comment Arabic inverted damma U+0657
|
1136 |
+
::s ۚ ::t j ::comment Arabic small high jeem U+06DA
|
1137 |
+
::s ۪ ::t ::comment Arabic empty centre low stop U+06EA
|
1138 |
+
::s ۬ ::t ::comment Arabic rounded high stop with filled center U+06EC
|
1139 |
+
|
1140 |
+
# Pashto
|
1141 |
+
::s ٙ ::t e ::comment Arabic zwarakay
|
1142 |
+
::s ځ ::t z ::t-alt dz ::comment Pashto letter zim; Arabic letter "hah with hamza above"
|
1143 |
+
::s څ ::t ts ::t-alt c ::comment Pashto letter tsim; Arabic letter "h with three dots above"
|
1144 |
+
::s ګ ::t g ::comment Pashto letter gaf; Arabic letter "kaf with ring"
|
1145 |
+
::s ڼ ::t n ::comment Arabic letter "noon with ring"
|
1146 |
+
::s ږ ::t g ::t-alt z, zh, j ::comment pronunciation varies regionally
|
1147 |
+
::s ښ ::t kh ::t-alt sh ::comment pronunciation varies regionally
|
1148 |
+
::s ه ::t h ::t-alt a ::lcode pus
|
1149 |
+
::s ۀ ::t e ::lcode pus ::comment Arabic letter "heh with yeh above"
|
1150 |
+
::s و ::t w ::t-alt o, u ::lcode pus
|
1151 |
+
::s ی ::t ay ::t-alt y ::lcode pus
|
1152 |
+
::s وی ::t wy ::t-alt oy, uy ::lcode pus
|
1153 |
+
::s ای ::t ay ::lcode pus
|
1154 |
+
::s ۍ ::t ay ::lcode pus
|
1155 |
+
::s ئ ::t ay ::t-alt y ::lcode pus
|
1156 |
+
::s ژ ::t zh ::t-alt z ::lcode pus ::comment [ʒ]
|
1157 |
+
::s ض ::t z ::t-alt d ::lcode pus
|
1158 |
+
::s ث ::t s ::lcode pus ::t-alt th ::comment Arabic letter theh (unvoiced th/θ)
|
1159 |
+
::s ذ ::t z ::lcode pus ::t-alt th ::comment Arabic letter thal (voiced th/ð)
|
1160 |
+
|
1161 |
+
# Hebrew
|
1162 |
+
::s ב ::t v ::comment Hebrew letter bet ::t-alt b
|
1163 |
+
::s כ ::t k ::comment Hebrew letter kaf ::t-alt kh
|
1164 |
+
::s ך ::t k ::comment Hebrew letter kaf ::t-alt kh
|
1165 |
+
::s פ ::t f ::comment Hebrew letter pe ::t-alt p
|
1166 |
+
::s ש ::t sh ::comment Hebrew letter shin ::t-alt s
|
1167 |
+
::s ו ::t v ::comment Hebrew letter vav ::t-alt o, u
|
1168 |
+
::s ח ::t ch ::comment Hebrew letter het ::t-alt h ::use-alt-in-pointed
|
1169 |
+
::s ק ::t q ::t-alt k ::use-alt-in-pointed
|
1170 |
+
::s וֹ ::t o
|
1171 |
+
::s וּ ::t u
|
1172 |
+
::s קְוָ ::t qva ::t-alt kva ::use-alt-in-pointed
|
1173 |
+
::s י ::t y
|
1174 |
+
::s יּ ::t y
|
1175 |
+
::s יָּ ::t ya
|
1176 |
+
::s ײ ::t yy ::comment Hebrew ligature Yiddish double Yod (CHECK)
|
1177 |
+
::s ׯ ::t yyy ::comment HEBREW YOD TRIANGLE (CHECK)
|
1178 |
+
::s ע ::t '
|
1179 |
+
::s ִי ::t i ::t-alt iy ::use-alt-in-pointed
|
1180 |
+
::s ֵי ::t e
|
1181 |
+
::s ִיּ ::t iy
|
1182 |
+
::s ִיָּ ::t iya
|
1183 |
+
::s ױ ::t oy
|
1184 |
+
::s א ::t a ::t-alt '
|
1185 |
+
::s אָ ::t a
|
1186 |
+
::s ֹא ::t o
|
1187 |
+
::s אַ ::t 'a
|
1188 |
+
::s אֲ ::t 'a
|
1189 |
+
::s אֶ ::t e
|
1190 |
+
::s אֱ ::t e
|
1191 |
+
::s פ ::t f
|
1192 |
+
::s פּ ::t p
|
1193 |
+
::s פַּ ::t pa
|
1194 |
+
::s פְּ ::t pe ::t-alt p ::use-alt-in-pointed
|
1195 |
+
::s שׁ ::t sh
|
1196 |
+
::s שָׁ ::t sha
|
1197 |
+
::s שָּׁ ::t sha ::comment ?
|
1198 |
+
::s שְׁ ::t she ::t-alt sh ::use-alt-in-pointed
|
1199 |
+
::s שֶׁ ::t she
|
1200 |
+
::s שִׁ ::t shi
|
1201 |
+
::s שֻׁ ::t shu
|
1202 |
+
::s שׂ ::t s
|
1203 |
+
::s שָׂ ::t sa
|
1204 |
+
::s שְׂ ::t s ::t-alt se ::use-alt-in-pointed
|
1205 |
+
::s כּ ::t k
|
1206 |
+
::s כֶּ ::t ke
|
1207 |
+
::s כֹּ ::t ko
|
1208 |
+
::s בּ ::t b
|
1209 |
+
::s בַּ ::t ba
|
1210 |
+
::s בָּ ::t ba
|
1211 |
+
::s בְּ ::t be ::t-alt b ::use-alt-in-pointed
|
1212 |
+
::s בֶּ ::t be
|
1213 |
+
::s תּ ::t t
|
1214 |
+
::s תַּ ::t ta
|
1215 |
+
::s תֵּ ::t te
|
1216 |
+
::s תִּ ::t ti
|
1217 |
+
::s דָּ ::t da
|
1218 |
+
::s דְּ ::t de ::t-alt d ::use-alt-in-pointed
|
1219 |
+
::s גּ ::t g
|
1220 |
+
::s לֵּ ::t le
|
1221 |
+
::s ד׳ ::t dh
|
1222 |
+
::s ג׳ ::t j
|
1223 |
+
::s ת׳ ::t th
|
1224 |
+
::s ז׳ ::t zh
|
1225 |
+
::s חַ ::t ach ::comment furtive patah ::use-only-at-end-of-word
|
1226 |
+
::s עַ ::t a' ::comment furtive patah ::use-only-at-end-of-word
|
1227 |
+
::s הַּ ::t ah ::comment furtive patah ::use-only-at-end-of-word
|
1228 |
+
::s ַ ::t a ::comment Hebrew point patah
|
1229 |
+
::s ֲ ::t a ::comment Hebrew point hataf patah (hataf = reduced)
|
1230 |
+
::s ֳ ::t o ::comment Hebrew point hataf qamats
|
1231 |
+
::s ָ ::t a ::comment Hebrew point qamats ::t-alt o ::use-alt-in-pointed
|
1232 |
+
::s ֶ ::t e ::comment Hebrew point segol
|
1233 |
+
::s ֱ ::t e ::comment Hebrew point hataf segol (hataf = reduced)
|
1234 |
+
::s ְ ::t e ::comment Hebrew point sheva ::t-alt "" ::use-alt-in-pointed
|
1235 |
+
::s ֵ ::t e ::comment Hebrew point tsere
|
1236 |
+
::s ִ ::t i ::comment Hebrew point hiriq
|
1237 |
+
::s ֹ ::t o ::comment Hebrew point holam
|
1238 |
+
::s ֻ ::t u ::comment Hebrew point qubuts
|
1239 |
+
# ::s ּ ::t "" ::comment Hebrew point dagesh or mapiq
|
1240 |
+
|
1241 |
+
# Yiddish
|
1242 |
+
::s א ::t a ::lcode yid ::comment called "silent" alef
|
1243 |
+
::s אי ::t y ::lcode yid
|
1244 |
+
::s איי ::t ey ::lcode yid
|
1245 |
+
::s או ::t u ::lcode yid
|
1246 |
+
::s אוי ::t oy ::lcode yid
|
1247 |
+
::s אַ ::t a ::lcode yid
|
1248 |
+
::s אָ ::t o ::lcode yid
|
1249 |
+
::s ב ::t b ::lcode yid
|
1250 |
+
::s בֿ ::t v ::lcode yid
|
1251 |
+
::s דזש ::t dzh ::lcode yid
|
1252 |
+
::s ו ::t u ::lcode yid
|
1253 |
+
::s וּ ::t u ::lcode yid
|
1254 |
+
::s וֹ ::t o ::lcode yid
|
1255 |
+
::s װ ::t v ::lcode yid
|
1256 |
+
::s ווא ::t wa ::lcode yid
|
1257 |
+
::s וואַ ::t wa ::lcode yid
|
1258 |
+
::s ווע ::t we ::lcode yid
|
1259 |
+
::s ווי ::t wi ::lcode yid
|
1260 |
+
::s וואוי ::t wo ::lcode yid
|
1261 |
+
::s וי ::t oy ::lcode yid
|
1262 |
+
::s זש ::t zh ::lcode yid
|
1263 |
+
::s ח ::t ch ::lcode yid
|
1264 |
+
::s טש ::t tsh ::lcode yid
|
1265 |
+
::s יִ ::t i ::lcode yid
|
1266 |
+
::s יי ::t ey ::lcode yid ::comment maybe "yi" at beginning of word
|
1267 |
+
::s ײַ ::t ay ::lcode yid
|
1268 |
+
::s כּ ::t k ::lcode yid
|
1269 |
+
::s כ ::t ch ::lcode yid
|
1270 |
+
::s ך ::t ch ::lcode yid
|
1271 |
+
::s ע ::t e ::lcode yid
|
1272 |
+
::s פּ ::t p ::lcode yid
|
1273 |
+
::s פֿ ::t f ::lcode yid
|
1274 |
+
::s ף ::t f ::lcode yid ::comment sometimes p
|
1275 |
+
::s ק ::t k ::lcode yid
|
1276 |
+
::s ת ::t s ::lcode yid
|
1277 |
+
|
1278 |
+
# Syriac/Aramaic (should be vetted by expert)
|
1279 |
+
::s ܰ ::t a ::comment Syriac pthaha above
|
1280 |
+
::s ܲ ::t a ::comment Syriac pthaha dotted
|
1281 |
+
::s ܳ ::t aa ::comment Syriac zqapha above
|
1282 |
+
::s ܴ ::t aa ::comment Syriac zqapha below
|
1283 |
+
::s ܵ ::t aa ::comment Syriac zqapha dotted
|
1284 |
+
::s ܶ ::t e ::comment Syriac rbasa above
|
1285 |
+
::s ܷ ::t e ::comment Syriac rbasa below
|
1286 |
+
::s ܿ ::t o ::comment Syriac rwaha
|
1287 |
+
::s ܸ ::t e ::comment Syriac dotted zlama horizontal
|
1288 |
+
::s ܹ ::t e ::comment Syriac dotted zlama angular
|
1289 |
+
::s ܺ ::t i ::comment Syriac hbasa above
|
1290 |
+
::s ܝܺ ::t i ::comment Syriac yudh + hbasa above
|
1291 |
+
::s ܼ ::t u ::comment Syriac hbasa-esasa dotted
|
1292 |
+
::s ܽ ::t o ::comment Syriac esasa above
|
1293 |
+
::s ܾ ::t u ::comment Syriac esasa below
|
1294 |
+
::s ݇ ::t "" ::comment Syriac oblique line above; indication of a silent letter
|
1295 |
+
|
1296 |
+
::s ܖ ::t d ::comment Syriac letter dotless dalath rish; ambiguous form for undifferentiated early dalath/rish
|
1297 |
+
::s ܜ ::t t ::comment Syriac letter teth garshuni; used in Garshuni documents
|
1298 |
+
::s ܒ݂ ::t v ::comment Syriac beth + rukkakha
|
1299 |
+
::s ܒ̥ ::t v ::comment Syriac beth + ring-below
|
1300 |
+
::s ܓ݂ ::t g ::comment Syriac gammal + rukkakha [IPA: ɣ]
|
1301 |
+
::s ܓ̥ ::t g ::comment Syriac gammal + ring-below [IPA: ɣ]
|
1302 |
+
::s ܕ݂ ::t d ::comment Syriac dalath + rukkakha [IPA: ð]
|
1303 |
+
::s ܕ̥ ::t d ::comment Syriac dalath + ring-below [IPA: ð]
|
1304 |
+
::s ܟ݂ ::t kh ::comment Syriac kaph + rukkakha [IPA: x]
|
1305 |
+
::s ܟ̥ ::t kh ::comment Syriac kaph + ring-below [IPA: x]
|
1306 |
+
::s ܦ݂ ::t f ::comment Syriac pe + rukkakha
|
1307 |
+
::s ܦ̥ ::t f ::comment Syriac pe + ring-below
|
1308 |
+
::s ܦ݁ ::t p ::comment Syriac pe + qushshaya
|
1309 |
+
::s ܬ݂ ::t th ::comment Syriac taw + rukkakha [IPA: θ]
|
1310 |
+
::s ܬ̥ ::t th ::comment Syriac taw + ring-below [IPA: θ]
|
1311 |
+
|
1312 |
+
::s ܄ ::t : ::comment Syriac sublinear colon; used at the end of verses of supplications
|
1313 |
+
::s ܆ ::t , ::comment Syriac colon skewed left; marks a dependent clause
|
1314 |
+
::s ܇ ::t , ::comment Syriac colon skewed right; marks the end of a subdivision of the apodosis, or latter part of a Biblical verse
|
1315 |
+
|
1316 |
+
# Uzbek
|
1317 |
+
::s ʻ ::t ' ::comment modifies pronunciation of preceding "o" and "g"
|
1318 |
+
::s ʼ ::t ' ::comment glottal stop (tutuq belgisi)
|
1319 |
+
|
1320 |
+
# Uyghur
|
1321 |
+
::s ئا ::t a ::lcode uig
|
1322 |
+
::s ە ::t e ::lcode uig
|
1323 |
+
::s ئې ::t e ::lcode uig ::latinplus ë
|
1324 |
+
::s ې ::t e ::lcode uig ::latinplus ë
|
1325 |
+
::s ئە ::t e ::lcode uig
|
1326 |
+
::s يە ::t e ::lcode uig
|
1327 |
+
::s ئى ::t i ::lcode uig
|
1328 |
+
::s ى ::t i ::lcode uig
|
1329 |
+
::s ئو ::t o ::lcode uig
|
1330 |
+
::s و ::t o ::lcode uig
|
1331 |
+
::s ئۇ ::t u ::lcode uig
|
1332 |
+
::s ۇ ::t u ::lcode uig
|
1333 |
+
::s چ ::t ch ::t-alt q ::lcode uig
|
1334 |
+
::s خ ::t x ::lcode uig
|
1335 |
+
::s ژ ::t zh ::lcode uig
|
1336 |
+
::s ئۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
|
1337 |
+
::s ۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
|
1338 |
+
::s ئۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
|
1339 |
+
::s ۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
|
1340 |
+
::s ۋ ::t w ::lcode uig
|
1341 |
+
|
1342 |
+
# Maldivian
|
1343 |
+
::s ް ::t ::comment thaana sukun
|
1344 |
+
::s ަ ::t a ::comment thaana abafili
|
1345 |
+
::s ާ ::t aa ::comment thaana aabaafili
|
1346 |
+
::s ި ::t i ::comment thaana ibifili
|
1347 |
+
::s ީ ::t ee ::comment thaana eebeefili
|
1348 |
+
::s ު ::t u ::comment thaana ubufili
|
1349 |
+
::s ޫ ::t oo ::comment thaana ooboofili
|
1350 |
+
::s ެ ::t e ::comment thaana ebefili
|
1351 |
+
::s ޭ ::t ey ::comment thaana eybeyfili
|
1352 |
+
::s ޮ ::t o ::comment thaana obofili
|
1353 |
+
::s ޯ ::t oa ::comment thaana oaboafili
|
1354 |
+
|
1355 |
+
# Canadian syllabics (Inuktitut)
|
1356 |
+
::s ᑊ ::t p ::comment syllable final
|
1357 |
+
::s ᐟ ::t t ::comment syllable final
|
1358 |
+
::s ᐠ ::t k ::comment syllable final
|
1359 |
+
::s ᐨ ::t c ::comment syllable final
|
1360 |
+
::s ᒼ ::t m ::comment syllable final
|
1361 |
+
::s ᐣ ::t n ::comment syllable final
|
1362 |
+
::s ᐢ ::t s ::comment syllable final
|
1363 |
+
::s ᐧ ::t y ::comment syllable final
|
1364 |
+
::s ᐤ ::t w ::comment syllable final
|
1365 |
+
::s ᐦ ::t h ::comment syllable final
|
1366 |
+
::s ᕽ ::t hk ::comment syllable final
|
1367 |
+
::s ᓫ ::t l ::comment syllable final
|
1368 |
+
::s ᕑ ::t r ::comment syllable final
|
1369 |
+
|
1370 |
+
# Mongolian
|
1371 |
+
::s ᢅ ::t ::comment MONGOLIAN LETTER ALI GALI BALUDA (CHECK) indicates assimilation
|
1372 |
+
::s ᢆ ::t ::comment MONGOLIAN LETTER ALI GALI THREE BALUDA (CHECK) indicates assimilation
|
1373 |
+
|
1374 |
+
# Tibetan
|
1375 |
+
::s ྅ ::t ::comment TIBETAN MARK PALUTA (CHECK) indicates assimilation
|
1376 |
+
|
1377 |
+
## Punctuation
|
1378 |
+
# delete
|
1379 |
+
::s ¿ ::t "" ::comment inverted question mark
|
1380 |
+
::s ¡ ::t "" ::comment inverted exclamation mark
|
1381 |
+
# decompose double-punctuation
|
1382 |
+
::s ‼ ::t !!
|
1383 |
+
::s ⁇ ::t ??
|
1384 |
+
::s ⁉ ::t !?
|
1385 |
+
::s ⁈ ::t ?!
|
1386 |
+
# preserve
|
1387 |
+
::s ′ ::t ′
|
1388 |
+
::s ∩ ::t ∩
|
1389 |
+
::s ‡ ::t ‡
|
1390 |
+
# Cyrillic
|
1391 |
+
::s ⁙ ::t . ::comment five dot punctuation
|
1392 |
+
# Amharic/Ethiopian
|
1393 |
+
::s ። ::t .
|
1394 |
+
::s ፣ ::t ,
|
1395 |
+
::s ፤ ::t ;
|
1396 |
+
::s ፥ ::t :
|
1397 |
+
::s ፧ ::t ? ::comment Ethiopic question mark
|
1398 |
+
::s ፡ ::t " " ::comment Ethiopic wordspace
|
1399 |
+
::s ፦ ::t : ::comment Ethiopic preface colon
|
1400 |
+
# Ethiopic wordspace often appropriated for other purposes:
|
1401 |
+
::s ፡፡ ::t .
|
1402 |
+
::s ፡- ::t :
|
1403 |
+
::s "፡ " ::t ", "
|
1404 |
+
::s ቸ ::t cha ::comment Ethiopic syllable ca
|
1405 |
+
::s ቹ ::t chu ::comment Ethiopic syllable cu
|
1406 |
+
::s ቺ ::t chi ::comment Ethiopic syllable ci
|
1407 |
+
::s ቻ ::t chaa ::comment Ethiopic syllable caa
|
1408 |
+
::s ቼ ::t chee ::comment Ethiopic syllable cee
|
1409 |
+
::s ች ::t che ::comment Ethiopic syllable ce
|
1410 |
+
::s ቾ ::t cho ::comment Ethiopic syllable co
|
1411 |
+
::s ሠ ::t sa ::comment Ethiopic syllable sza
|
1412 |
+
::s ሡ ::t su ::comment Ethiopic syllable szu
|
1413 |
+
::s ሢ ::t si ::comment Ethiopic syllable szi
|
1414 |
+
::s ሣ ::t saa ::comment Ethiopic syllable szaa
|
1415 |
+
::s ሤ ::t see ::comment Ethiopic syllable szee
|
1416 |
+
::s ሥ ::t se ::comment Ethiopic syllable sze
|
1417 |
+
::s ሦ ::t so ::comment Ethiopic syllable szo
|
1418 |
+
::s ጠ ::t ta ::comment Ethiopic syllable tha with ejective 't'
|
1419 |
+
::s ጡ ::t tu ::comment Ethiopic syllable thu with ejective 't'
|
1420 |
+
::s ጢ ::t ti ::comment Ethiopic syllable thi with ejective 't'
|
1421 |
+
::s ጣ ::t taa ::comment Ethiopic syllable thaa with ejective 't'
|
1422 |
+
::s ጤ ::t tee ::comment Ethiopic syllable thee with ejective 't'
|
1423 |
+
::s ጥ ::t te ::comment Ethiopic syllable the with ejective 't'
|
1424 |
+
::s ጦ ::t to ::comment Ethiopic syllable tho with ejective 't'
|
1425 |
+
|
1426 |
+
# Devanagari (Hindi etc.)
|
1427 |
+
::s । ::t . ::comment danda
|
1428 |
+
::s ॥ ::t . ::comment double danda
|
1429 |
+
::s ৷ ::t . ::comment Bengali currency numerator four; used as danda
|
1430 |
+
::s ॰ ::t . ::comment Devanagari abbreviation sign
|
1431 |
+
# Bengali
|
1432 |
+
::s ৽ ::t . ::comment BENGALI ABBREVIATION SIGN
|
1433 |
+
::s ৾ ::t ::comment BENGALI SANDHI MARK (CHECK)
|
1434 |
+
# Gurmukhi
|
1435 |
+
::s ੶ ::t . ::comment GURMUKHI ABBREVIATION SIGN
|
1436 |
+
# Oriya/Odia (India)
|
1437 |
+
::s ::t . ::comment danda (deprecated, should use Devanagari danda ।)
|
1438 |
+
::s ::t . ::comment double danda (deprecated, should use Devanagari double danda ॥)
|
1439 |
+
# Tibetan
|
1440 |
+
::s ། ::t ,
|
1441 |
+
::s །: ::t :
|
1442 |
+
::s ༏ ::t ;
|
1443 |
+
::s ༎ ::t .
|
1444 |
+
::s ༑ ::t , ::comment Tibetan mark run chen spungs shad
|
1445 |
+
::s ༼ ::t ( ::comment Tibetan open roof punctuation
|
1446 |
+
::s ༽ ::t ) ::comment Tibetan close roof punctuation
|
1447 |
+
::s ༈ ::t "" ::comment Tibetan mark srbul shad
|
1448 |
+
::s 【 ::t [ ::comment left black lenticular bracket
|
1449 |
+
::s 】 ::t ] ::comment right black lenticular bracket
|
1450 |
+
::s ༄ ::t "" ::comment Tibetan head mark
|
1451 |
+
::s ༄༅ ::t "" ::comment Tibetan head mark
|
1452 |
+
::s ༆ ::t "" ::comment Tibetan head mark
|
1453 |
+
# Myanmar/Burmese
|
1454 |
+
::s ၊ ::t ,
|
1455 |
+
::s ။ ::t .
|
1456 |
+
# Khmer
|
1457 |
+
::s ៖ ::t ; ::comment Khmer sign camnuc pii kuuh
|
1458 |
+
::s ។ ::t . ::comment Khmer sign khan
|
1459 |
+
# Arabic
|
1460 |
+
::s ، ::t ,
|
1461 |
+
::s ؛ ::t ;
|
1462 |
+
::s ٬ ::t ,
|
1463 |
+
::s ۔ ::t .
|
1464 |
+
::s ؟ ::t ?
|
1465 |
+
::s ٪ ::t %
|
1466 |
+
::s ٫ ::t , ::comment Arabic decimal separator
|
1467 |
+
::s ۽ ::t & ::comment Arabic sign Sindhi ampersand
|
1468 |
+
# Aramaic
|
1469 |
+
::s ܀ ::t .
|
1470 |
+
::s ܂ ::t .
|
1471 |
+
# Hebrew
|
1472 |
+
::s ־ ::t - ::comment maqaf
|
1473 |
+
# Armenian
|
1474 |
+
::s ։ ::t .
|
1475 |
+
::s ՝ ::t , ::comment Armenian comma
|
1476 |
+
# Chinese
|
1477 |
+
::s , ::t ", "
|
1478 |
+
::s 、 ::t ", "
|
1479 |
+
::s 。 ::t ". "
|
1480 |
+
::s ! ::t "! "
|
1481 |
+
::s ? ::t "? "
|
1482 |
+
::s 「 ::t ' "'
|
1483 |
+
::s 」 ::t '" '
|
1484 |
+
::s 《 ::t ' "'
|
1485 |
+
::s 》 ::t '" '
|
1486 |
+
::s ( ::t " ("
|
1487 |
+
::s ) ::t ") "
|
1488 |
+
::s ; ::t ;
|
1489 |
+
::s : ::t ": "
|
1490 |
+
::s ︰ ::t ": "
|
1491 |
+
::s - ::t -
|
1492 |
+
::s / ::t /
|
1493 |
+
::s = ::t =
|
1494 |
+
::s ~ ::t ~
|
1495 |
+
::s & ::t &
|
1496 |
+
::s < ::t <
|
1497 |
+
::s > ::t >
|
1498 |
+
::s % ::t %
|
1499 |
+
::s _ ::t _ ::comment FULLWIDTH LOW LINE (U+FF3F)
|
1500 |
+
::s { ::t { ::comment FULLWIDTH LEFT CURLY BRACKET (U+FF5B)
|
1501 |
+
::s } ::t } ::comment FULLWIDTH RIGHT CURLY BRACKET (U+FF5D)
|
1502 |
+
::s ::t " " ::comment ideographic space
|
1503 |
+
# Japanese
|
1504 |
+
::s 『 ::t ' "'
|
1505 |
+
::s 』 ::t '" '
|
1506 |
+
::s ・ ::t " " ::comment Katakana middle dot; separates name elements such as first and last name
|
1507 |
+
# N'ko
|
1508 |
+
::s ߽ ::t . ::comment NKO DANTAYALAN used to abbreviate units of measure
|
1509 |
+
# Medefaidrin
|
1510 |
+
::s 𖺗 ::t , ::comment MEDEFAIDRIN COMMA
|
1511 |
+
::s 𖺘 ::t . ::comment MEDEFAIDRIN FULL STOP
|
1512 |
+
# Khitan
|
1513 |
+
::s 𖿤 ::t ::comment KHITAN SMALL SCRIPT FILLER
|
1514 |
+
|
1515 |
+
# Symbols
|
1516 |
+
::s ∞ ::t ∞ ::comment infinity
|
1517 |
+
::s ::t ::comment soft hyphen; used to indicate preferred line breaks; remove
|
1518 |
+
::s ֊ ::t - ::comment Armenian hyphen; map to regular hyphen-minus
|
1519 |
+
::s ᐩ ::t + ::comment Canadian syllabics final plus; map to regular plus
|
1520 |
+
::s ﹐ ::t , ::comment small comma; map to regular comma
|
1521 |
+
::s ˚ ::t ° ::comment ring above; map to degree sign
|
1522 |
+
::s ⇒ ::t ⇒ ::comment rightwards double arrow
|
1523 |
+
::s † ::t † ::comment dagger
|
1524 |
+
::s • ::t • ::comment bullet
|
1525 |
+
::s ℃ ::t °C ::comment degree Celsius; split into 2 characters
|
1526 |
+
::s ℉ ::t °F ::comment degree Fahrenheit; split into 2 characters
|
1527 |
+
::s ― ::t ― ::comment horizontal bar
|
1528 |
+
::s ˇ ::t ˇ ::comment caron (sometimes apparently used for "Arabic vowel sign small v above" U+065A, e.g. in Gilaki language (glk))
|
1529 |
+
::s ″ ::t ″ ::comment double prime
|
1530 |
+
::s ﴾ ::t ( ::comment ornate left parenthesis
|
1531 |
+
::s ﴿ ::t ) ::comment ornate right parenthesis
|
1532 |
+
::s 〔 ::t [ ::comment left tortoise shell bracket
|
1533 |
+
::s 〕 ::t ] ::comment right tortoise shell bracket
|
1534 |
+
::s ﹝ ::t ( ::comment small left tortoise shell bracket
|
1535 |
+
::s ﹞ ::t ) ::comment small right tortoise shell bracket
|
1536 |
+
::s ¦ ::t ¦ ::comment BROKEN BAR (U+00A6)
|
1537 |
+
::s ¨ ::t ::comment DIAERESIS (U+00A8)
|
1538 |
+
::s ¯ ::t ::comment MACRON (U+00AF)
|
1539 |
+
::s ¸ ::t ::comment CEDILLA (U+00B8)
|
1540 |
+
::s Ƿ ::t W ::comment LATIN CAPITAL LETTER WYNN (U+01F7)
|
1541 |
+
::s ˘ ::t ::comment BREVE (U+02D8)
|
1542 |
+
::s ˛ ::t ::comment OGONEK (U+02DB)
|
1543 |
+
::s ˜ ::t ~ ::comment SMALL TILDE (U+02DC)
|
1544 |
+
::s ̒ ::t ::comment COMBINING TURNED COMMA ABOVE (U+0312)
|
1545 |
+
::s ̔ ::t ::comment COMBINING REVERSED COMMA ABOVE (U+0314)
|
1546 |
+
::s ̜ ::t ::comment COMBINING LEFT HALF RING BELOW (U+031C)
|
1547 |
+
::s ̧ ::t ::comment COMBINING CEDILLA (U+0327)
|
1548 |
+
::s ̫ ::t ::comment COMBINING INVERTED DOUBLE ARCH BELOW (U+032B)
|
1549 |
+
::s ̲ ::t ::comment COMBINING LOW LINE (U+0332)
|
1550 |
+
::s ̳ ::t ::comment COMBINING DOUBLE LOW LINE (U+0333)
|
1551 |
+
::s ̹ ::t ::comment COMBINING RIGHT HALF RING BELOW (U+0339)
|
1552 |
+
::s ̺ ::t ::comment COMBINING INVERTED BRIDGE BELOW (U+033A)
|
1553 |
+
::s ̿ ::t ::comment COMBINING DOUBLE OVERLINE (U+033F)
|
1554 |
+
::s ͅ ::t ::comment COMBINING GREEK YPOGEGRAMMENI (U+0345)
|
1555 |
+
::s ͑ ::t ::comment COMBINING LEFT HALF RING ABOVE (U+0351)
|
1556 |
+
::s ͗ ::t ::comment COMBINING RIGHT HALF RING ABOVE (U+0357)
|
1557 |
+
::s ͚ ::t ::comment COMBINING DOUBLE RING BELOW (U+035A)
|
1558 |
+
::s ͜ ::t ::comment COMBINING DOUBLE BREVE BELOW (U+035C)
|
1559 |
+
::s ͝ ::t ::comment COMBINING DOUBLE BREVE (U+035D)
|
1560 |
+
::s ͞ ::t ::comment COMBINING DOUBLE MACRON (U+035E)
|
1561 |
+
::s ͟ ::t ::comment COMBINING DOUBLE MACRON BELOW (U+035F)
|
1562 |
+
::s ͠ ::t ::comment COMBINING DOUBLE TILDE (U+0360)
|
1563 |
+
|
1564 |
+
::s ‐ ::t - ::comment HYPHEN (U+2010)
|
1565 |
+
::s ‗ ::t ‗ ::comment DOUBLE LOW LINE (U+2017)
|
1566 |
+
::s ‵ ::t ‵ ::comment REVERSED PRIME (U+2035)
|
1567 |
+
::s ‶ ::t ‶ ::comment REVERSED DOUBLE PRIME (U+2036)
|
1568 |
+
::s ‸ ::t ‸ ::comment CARET (U+2038)
|
1569 |
+
::s ‽ ::t ?! ::comment INTERROBANG (U+203D)
|
1570 |
+
::s ‾ ::t ‾ ::comment OVERLINE (U+203E)
|
1571 |
+
::s ‿ ::t ‿ ::comment UNDERTIE (U+203F)
|
1572 |
+
::s ⁂ ::t ⁂ ::comment ASTERISM (U+2042)
|
1573 |
+
::s ⁎ ::t * ::comment LOW ASTERISK (U+204E)
|
1574 |
+
::s ⁏ ::t ; ::comment REVERSED SEMICOLON (U+204F)
|
1575 |
+
::s ⁔ ::t ⁔ ::comment INVERTED UNDERTIE (U+2054)
|
1576 |
+
::s ⁝ ::t ⁝ ::comment TRICOLON (U+205D)
|
1577 |
+
::s ::t " " ::comment MEDIUM MATHEMATICAL SPACE (U+205F)
|
1578 |
+
::s ₋ ::t - ::comment SUBSCRIPT MINUS (U+208B)
|
1579 |
+
::s ⃩ ::t ::comment COMBINING WIDE BRIDGE ABOVE (U+20E9)
|
1580 |
+
|
1581 |
+
::s ﹔ ::t ; ::comment SMALL SEMICOLON (U+FE54)
|
1582 |
+
::s ﹕ ::t : ::comment SMALL COLON (U+FE55)
|
1583 |
+
::s ﹛ ::t { ::comment SMALL LEFT CURLY BRACKET (U+FE5B)
|
1584 |
+
::s ﹜ ::t } ::comment SMALL RIGHT CURLY BRACKET (U+FE5C)
|
1585 |
+
::s ﹠ ::t & ::comment SMALL AMPERSAND (U+FE60)
|
1586 |
+
::s ﹡ ::t * ::comment SMALL ASTERISK (U+FE61)
|
1587 |
+
::s ﹣ ::t - ::comment SMALL HYPHEN-MINUS (U+FE63)
|
1588 |
+
|
1589 |
+
::s ℈ ::t ℈ ::comment SCRUPLE (U+2108)
|
1590 |
+
::s ℟ ::t ℟ ::comment RESPONSE (U+211F)
|
1591 |
+
::s ℣ ::t ℣ ::comment VERSICLE (U+2123)
|
1592 |
+
::s ℽ ::t ℽ ::comment DOUBLE-STRUCK SMALL GAMMA (U+213D)
|
1593 |
+
::s ℾ ::t ℾ ::comment DOUBLE-STRUCK CAPITAL GAMMA (U+213E)
|
1594 |
+
::s ⅋ ::t ⅋ ::comment TURNED AMPERSAND (U+214B)
|
1595 |
+
::s ⅍ ::t A/S ::comment AKTIESELSKAB (U+214D)
|
1596 |
+
|
1597 |
+
::s ⑃ ::t ⑃ ::comment OCR INVERTED FORK (U+2443)
|
1598 |
+
::s ⑊ ::t \\ ::comment OCR DOUBLE BACKSLASH (U+244A)
|
1599 |
+
::s ⟮ ::t ( ::comment MATHEMATICAL LEFT FLATTENED PARENTHESIS (U+27EE)
|
1600 |
+
::s ⟯ ::t ) ::comment MATHEMATICAL RIGHT FLATTENED PARENTHESIS (U+27EF)
|
1601 |
+
::s ⸨ ::t (( ::comment LEFT DOUBLE PARENTHESIS (U+2E28)
|
1602 |
+
::s ⸩ ::t )) ::comment RIGHT DOUBLE PARENTHESIS (U+2E29)
|
1603 |
+
|
1604 |
+
# kavyka indicates alternative reading
|
1605 |
+
::s ᷶ ::t ::comment COMBINING KAVYKA ABOVE RIGHT (U+1DF6)
|
1606 |
+
::s ᷷ ::t ::comment COMBINING KAVYKA ABOVE LEFT (U+1DF7)
|
1607 |
+
::s ⹅ ::t ::comment INVERTED LOW KAVYKA (U+2E45)
|
1608 |
+
::s ⹆ ::t ::comment INVERTED LOW KAVYKA WITH KAVYKA ABOVE (U+2E46)
|
1609 |
+
::s ⹇ ::t ::comment LOW KAVYKA (U+2E47)
|
1610 |
+
::s ⹈ ::t ::comment LOW KAVYKA WITH DOT (U+2E48)
|
1611 |
+
::s ꙾ ::t ::comment CYRILLIC KAVYKA (U+A67E)
|
1612 |
+
|
1613 |
+
# Braille
|
1614 |
+
::s ⠁ ::t a
|
1615 |
+
::s ⠃ ::t b
|
1616 |
+
::s ⠉ ::t c
|
1617 |
+
::s ⠙ ::t d
|
1618 |
+
::s ⠑ ::t e
|
1619 |
+
::s ⠋ ::t f
|
1620 |
+
::s ⠛ ::t g
|
1621 |
+
::s ⠓ ::t h
|
1622 |
+
::s ⠊ ::t i
|
1623 |
+
::s ⠚ ::t j
|
1624 |
+
::s ⠅ ::t k
|
1625 |
+
::s ⠇ ::t l
|
1626 |
+
::s ⠍ ::t m
|
1627 |
+
::s ⠝ ::t n
|
1628 |
+
::s ⠕ ::t o
|
1629 |
+
::s ⠏ ::t p
|
1630 |
+
::s ⠟ ::t q
|
1631 |
+
::s ⠗ ::t r
|
1632 |
+
::s ⠎ ::t s
|
1633 |
+
::s ⠞ ::t t
|
1634 |
+
::s ⠥ ::t u
|
1635 |
+
::s ⠧ ::t v
|
1636 |
+
::s ⠺ ::t w
|
1637 |
+
::s ⠭ ::t x
|
1638 |
+
::s ⠽ ::t y
|
1639 |
+
::s ⠵ ::t z
|
1640 |
+
|
1641 |
+
::s ⠜ ::t ae
|
1642 |
+
::s ⠪ ::t oe
|
1643 |
+
::s ⠳ ::t ue
|
1644 |
+
::s ⠷ ::t a ::comment à
|
1645 |
+
::s ⠡ ::t a ::comment â
|
1646 |
+
::s ⠿ ::t e ::comment é
|
1647 |
+
::s ⠮ ::t e ::comment è
|
1648 |
+
::s ⠣ ::t e ::comment ê
|
1649 |
+
::s ⠫ ::t e ::comment ë
|
1650 |
+
::s ⠩ ::t i ::comment î
|
1651 |
+
::s ⠻ ::t i ::comment ï
|
1652 |
+
::s ⠹ ::t o ::comment ô
|
1653 |
+
::s ⠾ ::t u ::comment ù
|
1654 |
+
::s ⠱ ::t u ::comment û
|
1655 |
+
|
1656 |
+
::s ⠡ ::t au ::lcode deu
|
1657 |
+
::s ⠌ ::t aeu ::lcode deu
|
1658 |
+
::s ⠹ ::t ch ::lcode deu
|
1659 |
+
::s ⠩ ::t ei ::lcode deu
|
1660 |
+
::s ⠣ ::t eu ::lcode deu
|
1661 |
+
::s ⠬ ::t ie ::lcode deu
|
1662 |
+
::s ⠱ ::t sch ::lcode deu
|
1663 |
+
::s ⠮ ::t ss ::lcode deu
|
1664 |
+
::s ⠾ ::t st ::lcode deu
|
1665 |
+
|
1666 |
+
::s ⠠⠠ ::t "" ::comment start of word all-caps mode
|
1667 |
+
# ::s ⠠⠁ ::t A
|
1668 |
+
# ::s ⠠⠃ ::t B
|
1669 |
+
# ::s ⠠⠉ ::t C
|
1670 |
+
# ::s ⠠⠙ ::t D
|
1671 |
+
# ::s ⠠⠑ ::t E
|
1672 |
+
# ::s ⠠⠋ ::t F
|
1673 |
+
# ::s ⠠⠛ ::t G
|
1674 |
+
# ::s ⠠⠓ ::t H
|
1675 |
+
# ::s ⠠⠊ ::t I
|
1676 |
+
# ::s ⠠⠚ ::t J
|
1677 |
+
# ::s ⠠⠅ ::t K
|
1678 |
+
# ::s ⠠⠇ ::t L
|
1679 |
+
# ::s ⠠⠍ ::t M
|
1680 |
+
# ::s ⠠⠝ ::t N
|
1681 |
+
# ::s ⠠⠕ ::t O
|
1682 |
+
# ::s ⠠⠏ ::t P
|
1683 |
+
# ::s ⠠⠟ ::t Q
|
1684 |
+
# ::s ⠠⠗ ::t R
|
1685 |
+
# ::s ⠠⠎ ::t S
|
1686 |
+
# ::s ⠠⠞ ::t T
|
1687 |
+
# ::s ⠠⠥ ::t U
|
1688 |
+
# ::s ⠠⠧ ::t V
|
1689 |
+
# ::s ⠠⠺ ::t W
|
1690 |
+
# ::s ⠠⠭ ::t X
|
1691 |
+
# ::s ⠠⠽ ::t Y
|
1692 |
+
# ::s ⠠⠵ ::t Z
|
1693 |
+
|
1694 |
+
::s ⠼⠁ ::t 1
|
1695 |
+
::s ⠼⠃ ::t 2
|
1696 |
+
::s ⠼⠉ ::t 3
|
1697 |
+
::s ⠼⠙ ::t 4
|
1698 |
+
::s ⠼⠑ ::t 5
|
1699 |
+
::s ⠼⠋ ::t 6
|
1700 |
+
::s ⠼⠛ ::t 7
|
1701 |
+
::s ⠼⠓ ::t 8
|
1702 |
+
::s ⠼⠊ ::t 9
|
1703 |
+
::s ⠼⠚ ::t 0
|
1704 |
+
|
1705 |
+
::s ⠂ ::t ,
|
1706 |
+
::s ⠆ ::t ;
|
1707 |
+
::s ⠒ ::t :
|
1708 |
+
::s ⠲ ::t .
|
1709 |
+
::s ⠦ ::t ?
|
1710 |
+
::s ⠖ ::t !
|
1711 |
+
::s ⠄ ::t '
|
1712 |
+
::s ⠤ ::t -
|
1713 |
+
::s ⠨⠤ ::t _
|
1714 |
+
|
1715 |
+
::s ⠀ ::t " " ::comment blank
|
1716 |
+
# ::s ⠐ ::t " " ::comment blank in numeric mode
|
1717 |
+
::s ⠈ ::t "" ::comment accent
|
1718 |
+
# ::s ⠌ ::t / ::comment in numeric mode only
|
1719 |
+
# ::s ⠐ ::comment abbreviation sign
|
1720 |
+
# ::s ⠘ ::comment abbreviation sign
|
1721 |
+
# ::s ⠠ ::comment capital indicator
|
1722 |
+
::s ⠨ ::t . ::comment decimal point; emphasis
|
1723 |
+
::s ⠰ ::t "" ::comment letter indicator
|
1724 |
+
# ::s ⠴ ::t ”
|
1725 |
+
# ::s ⠶ ::t ()
|
1726 |
+
# ::s ⠸ ::comment abbreviation sign
|
1727 |
+
::s ⠼ ::t "" ::comment number indicator
|
1728 |
+
::s ⠘⠚ ::t ° ::word-external-punctuation
|
1729 |
+
::s ⠘⠚⠠⠉ ::t °C
|
1730 |
+
::s ⠘⠚⠉ ::t °C
|
1731 |
+
::s ⠘⠚⠠⠋ ::t °F
|
1732 |
+
::s ⠘⠚⠋ ::t °F
|
1733 |
+
|
1734 |
+
::s ⠠⠶ ::t " ::word-external-punctuation
|
1735 |
+
::s ⠘⠦ ::t “ ::word-external-punctuation
|
1736 |
+
::s ⠘⠴ ::t ” ::word-external-punctuation
|
1737 |
+
::s ⠄⠦ ::t ‘
|
1738 |
+
::s ⠄⠴ ::t ’
|
1739 |
+
::s ⠠⠴ ::t ’
|
1740 |
+
::s ⠐⠣ ::t ( ::word-external-punctuation
|
1741 |
+
::s ⠐⠜ ::t ) ::word-external-punctuation
|
1742 |
+
::s ⠨⠣ ::t [ ::word-external-punctuation
|
1743 |
+
::s ⠨⠜ ::t ] ::word-external-punctuation
|
1744 |
+
::s ⠸⠣ ::t { ::word-external-punctuation
|
1745 |
+
::s ⠸⠜ ::t } ::word-external-punctuation
|
1746 |
+
::s ⠈⠣ ::t < ::word-external-punctuation
|
1747 |
+
::s ⠈⠜ ::t > ::word-external-punctuation
|
1748 |
+
::s ⠸⠌ ::t / ::word-external-punctuation
|
1749 |
+
::s ⠸⠡ ::t \ ::word-external-punctuation
|
1750 |
+
::s ⠠⠤ ::t – ::word-external-punctuation
|
1751 |
+
::s ⠐⠠⠤ ::t — ::word-external-punctuation
|
1752 |
+
::s ⠈⠯ ::t & ::word-external-punctuation
|
1753 |
+
::s ⠐⠔ ::t * ::word-external-punctuation
|
1754 |
+
::s ⠨⠦ ::t ∩ ::word-external-punctuation
|
1755 |
+
::s ⠨⠴ ::t % ::word-external-punctuation
|
1756 |
+
::s ⠐⠖ ::t + ::word-external-punctuation
|
1757 |
+
::s ⠐⠤ ::t − ::word-external-punctuation
|
1758 |
+
::s ⠐⠶ ::t = ::word-external-punctuation
|
1759 |
+
::s ⠈⠎ ::t $ ::word-external-punctuation
|
1760 |
+
::s ⠈⠉ ::t ¢ ::word-external-punctuation
|
1761 |
+
::s ⠈⠇ ::t £ ::word-external-punctuation
|
1762 |
+
::s ⠈⠽ ::t ¥ ::word-external-punctuation
|
1763 |
+
::s ⠈⠁ ::t @ ::word-external-punctuation
|
1764 |
+
::s ⠸⠹ ::t # ::word-external-punctuation
|
1765 |
+
::s ⠸⠲ ::t • ::word-external-punctuation
|
1766 |
+
::s ⠈⠢ ::t ^ ::word-external-punctuation
|
1767 |
+
::s ⠈⠔ ::t ~ ::word-external-punctuation
|
1768 |
+
::s ⠘⠉ ::t © ::word-external-punctuation
|
1769 |
+
::s ⠐⠌ ::t ÷ ::word-external-punctuation
|
1770 |
+
::s ⠐⠦ ::t × ::word-external-punctuation
|
1771 |
+
::s ⠈⠠⠹ ::t † ::word-external-punctuation
|
1772 |
+
::s ⠈⠠⠻ ::t ‡ ::word-external-punctuation
|
1773 |
+
::s ⠘⠏ ::t ¶ ::word-external-punctuation
|
1774 |
+
::s ⠘⠎ ::t § ::word-external-punctuation
|
1775 |
+
::s ⠘⠗ ::t ® ::word-external-punctuation
|
1776 |
+
::s ⠘⠞ ::t ™ ::word-external-punctuation
|
1777 |
+
|
1778 |
+
# English Braille
|
1779 |
+
::s ⠁⠃ ::t about ::lcode eng ::use-only-for-whole-word
|
1780 |
+
::s ⠁⠃⠧ ::t above ::lcode eng ::use-only-for-whole-word
|
1781 |
+
::s ⠁⠉ ::t according ::lcode eng ::use-only-for-whole-word
|
1782 |
+
::s ⠁⠉⠗ ::t across ::lcode eng ::use-only-for-whole-word
|
1783 |
+
::s ⠁⠋ ::t after ::lcode eng ::use-only-for-whole-word
|
1784 |
+
::s ⠁⠋⠝ ::t afternoon ::lcode eng ::use-only-for-whole-word
|
1785 |
+
::s ⠁⠋⠺ ::t afterward ::lcode eng ::use-only-for-whole-word
|
1786 |
+
::s ⠁⠛ ::t again ::lcode eng ::use-only-for-whole-word
|
1787 |
+
::s ⠁⠛⠌ ::t against ::lcode eng ::use-only-for-whole-word
|
1788 |
+
::s ⠠⠽ ::t ally ::lcode eng ::use-only-at-end-of-word ::use-only-in-lower-case-environment
|
1789 |
+
::s ⠁⠇⠍ ::t almost ::lcode eng ::use-only-for-whole-word
|
1790 |
+
::s ⠁⠇⠗ ::t already ::lcode eng ::use-only-for-whole-word
|
1791 |
+
::s ⠁⠇ ::t also ::lcode eng ::use-only-for-whole-word
|
1792 |
+
::s ⠁⠇⠹ ::t although ::lcode eng ::use-only-for-whole-word
|
1793 |
+
::s ⠁⠇⠞ ::t altogether ::lcode eng ::use-only-for-whole-word
|
1794 |
+
::s ⠁⠇⠺ ::t always ::lcode eng ::use-only-for-whole-word
|
1795 |
+
::s ⠨⠑ ::t ance ::lcode eng
|
1796 |
+
::s ⠯ ::t and ::lcode eng
|
1797 |
+
::s ⠜ ::t ar ::lcode eng
|
1798 |
+
::s ⠵ ::t as ::lcode eng ::use-only-for-whole-word
|
1799 |
+
::s ⠠⠝ ::t ation ::lcode eng ::use-only-at-end-of-word ::use-only-in-lower-case-environment
|
1800 |
+
::s ⠃ ::t b ::lcode eng
|
1801 |
+
::s ⠆ ::t bb ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
|
1802 |
+
::s ⠆ ::t be ::lcode eng ::use-only-at-start-of-word
|
1803 |
+
::s ⠆⠉ ::t because ::lcode eng ::use-only-for-whole-word
|
1804 |
+
::s ⠆⠋ ::t before ::lcode eng ::use-only-for-whole-word
|
1805 |
+
::s ⠆⠓ ::t behind ::lcode eng ::use-only-for-whole-word
|
1806 |
+
::s ⠆⠇ ::t below ::lcode eng ::use-only-for-whole-word
|
1807 |
+
::s ⠆⠝ ::t beneath ::lcode eng ::use-only-for-whole-word
|
1808 |
+
::s ⠆⠎ ::t beside ::lcode eng ::use-only-for-whole-word
|
1809 |
+
::s ⠆⠞ ::t between ::lcode eng ::use-only-for-whole-word
|
1810 |
+
::s ⠆⠽ ::t beyond ::lcode eng ::use-only-for-whole-word
|
1811 |
+
::s ⠃⠇ ::t blind ::lcode eng ::use-only-for-whole-word
|
1812 |
+
::s ⠃⠗⠇ ::t Braille ::lcode eng ::use-only-for-whole-word
|
1813 |
+
::s ⠃ ::t but ::lcode eng ::use-only-for-whole-word
|
1814 |
+
::s ⠉ ::t c ::lcode eng
|
1815 |
+
::s ⠉ ::t can ::lcode eng ::use-only-for-whole-word
|
1816 |
+
::s ⠸⠉ ::t cannot ::lcode eng
|
1817 |
+
::s ⠒ ::t cc ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
|
1818 |
+
::s ⠉⠧ ::t ceive ::lcode eng ::use-only-at-end-of-word
|
1819 |
+
::s ⠉⠧⠙ ::t ceived ::lcode eng ::use-only-at-end-of-word
|
1820 |
+
::s ⠉⠧⠎ ::t ceives ::lcode eng ::use-only-at-end-of-word
|
1821 |
+
::s ⠉⠧⠛ ::t ceiving ::lcode eng
|
1822 |
+
::s ⠡ ::t ch ::lcode eng
|
1823 |
+
::s ⠐⠡ ::t character ::lcode eng
|
1824 |
+
::s ⠡ ::t child ::lcode eng ::use-only-for-whole-word
|
1825 |
+
::s ⠡⠝ ::t children ::lcode eng ::use-only-for-whole-word
|
1826 |
+
::s ⠒ ::t con ::lcode eng ::use-only-at-start-of-word
|
1827 |
+
::s ⠒ ::t : ::lcode eng ::use-only-at-end-of-word
|
1828 |
+
::s ⠉⠙ ::t could ::lcode eng ::use-only-for-whole-word
|
1829 |
+
::s ⠙ ::t d ::lcode eng
|
1830 |
+
::s ⠙ ::t do ::lcode eng ::use-only-for-whole-word
|
1831 |
+
::s ⠐⠙ ::t day ::lcode eng
|
1832 |
+
# ::s ⠲ ::t dd ::t-alt . ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word ::comment abolished; interferes with period in abbreviations such as U.S.
|
1833 |
+
::s ⠙⠉⠇ ::t declare ::lcode eng
|
1834 |
+
::s ⠙⠉⠇⠛ ::t declaring ::lcode eng
|
1835 |
+
::s ⠲ ::t dis ::lcode eng ::use-only-at-start-of-word
|
1836 |
+
::s ⠲ ::t . ::lcode eng ::dont-use-at-start-of-word
|
1837 |
+
::s ⠑ ::t e ::lcode eng
|
1838 |
+
::s ⠂ ::t ea ::lcode eng ::dont-use-at-end-of-word
|
1839 |
+
::s ⠂ ::t , ::lcode eng ::use-only-at-end-of-word
|
1840 |
+
::s ⠫ ::t ed ::lcode eng
|
1841 |
+
::s ⠑⠊ ::t either ::lcode eng ::use-only-for-whole-word
|
1842 |
+
::s ⠢ ::t en ::lcode eng
|
1843 |
+
::s ⠰⠑ ::t ence ::lcode eng ::dont-use-at-start-of-word
|
1844 |
+
::s ⠢ ::t enough ::lcode eng ::use-only-for-whole-word
|
1845 |
+
::s ⠻ ::t er ::lcode eng
|
1846 |
+
::s ⠐⠑ ::t ever ::lcode eng
|
1847 |
+
::s ⠑ ::t every ::lcode eng ::use-only-for-whole-word
|
1848 |
+
::s ⠋ ::t f ::lcode eng
|
1849 |
+
::s ⠐⠋ ::t father ::lcode eng
|
1850 |
+
::s ⠖ ::t ff ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
|
1851 |
+
::s ⠋⠌ ::t first ::lcode eng
|
1852 |
+
::s ⠿ ::t for ::lcode eng
|
1853 |
+
::s ⠋⠗ ::t friend ::lcode eng ::use-only-for-whole-word
|
1854 |
+
::s ⠋⠗⠎ ::t friends ::lcode eng ::use-only-for-whole-word
|
1855 |
+
::s ⠋ ::t from ::lcode eng ::use-only-for-whole-word
|
1856 |
+
::s ⠰⠇ ::t ful ::lcode eng ::dont-use-at-start-of-word
|
1857 |
+
::s ⠛ ::t g ::lcode eng
|
1858 |
+
::s ⠶ ::t gg ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
|
1859 |
+
::s ⠣ ::t gh ::lcode eng
|
1860 |
+
::s ⠛ ::t go ::lcode eng ::use-only-for-whole-word
|
1861 |
+
::s ⠛⠙ ::t good ::lcode eng ::use-only-at-start-of-word
|
1862 |
+
::s ⠛⠗⠞ ::t great ::lcode eng
|
1863 |
+
::s ⠓ ::t h ::lcode eng
|
1864 |
+
::s ⠸⠓ ::t had ::lcode eng
|
1865 |
+
::s ⠓ ::t have ::lcode eng ::use-only-for-whole-word
|
1866 |
+
::s ⠐⠓ ::t here ::lcode eng
|
1867 |
+
::s ⠓⠻⠋ ::t herself ::lcode eng ::use-only-for-whole-word
|
1868 |
+
::s ⠓⠍ ::t him ::lcode eng ::use-only-for-whole-word
|
1869 |
+
::s ⠓⠍⠋ ::t himself ::lcode eng ::use-only-for-whole-word
|
1870 |
+
::s ⠦ ::t ? ::lcode eng
|
1871 |
+
::s ⠦ ::t his ::lcode eng ::use-only-for-whole-word
|
1872 |
+
::s ⠊⠍⠍ ::t immediate ::lcode eng ::use-only-for-whole-word
|
1873 |
+
::s ⠊⠍⠍⠇⠽ ::t immediately ::lcode eng ::use-only-for-whole-word
|
1874 |
+
::s ⠔ ::t in ::lcode eng
|
1875 |
+
::s ⠔⠒ ::t incon ::lcode eng ::use-only-at-start-of-word
|
1876 |
+
::s ⠬ ::t ing ::lcode eng
|
1877 |
+
::s ⠭ ::t it ::lcode eng ::use-only-for-whole-word
|
1878 |
+
::s ⠭⠎ ::t its ::lcode eng ::use-only-for-whole-word
|
1879 |
+
::s ⠭⠋ ::t itself ::lcode eng ::use-only-for-whole-word
|
1880 |
+
::s ⠰⠽ ::t ity ::lcode eng ::dont-use-at-start-of-word
|
1881 |
+
::s ⠚ ::t j ::lcode eng
|
1882 |
+
::s ⠚ ::t just ::lcode eng ::use-only-for-whole-word
|
1883 |
+
::s ⠅ ::t k ::lcode eng
|
1884 |
+
::s ⠐⠅ ::t know ::lcode eng
|
1885 |
+
::s ⠅ ::t knowledge ::lcode eng ::use-only-for-whole-word
|
1886 |
+
::s ⠇ ::t l ::lcode eng
|
1887 |
+
::s ⠨⠎ ::t less ::lcode eng ::dont-use-at-start-of-word
|
1888 |
+
::s ⠇⠗ ::t letter ::lcode eng ::use-only-for-whole-word
|
1889 |
+
::s ⠇⠗⠎ ::t letters ::lcode eng ::use-only-for-whole-word
|
1890 |
+
::s ⠇ ::t like ::lcode eng ::use-only-for-whole-word
|
1891 |
+
::s ⠇⠇ ::t little ::lcode eng ::use-only-for-whole-word
|
1892 |
+
::s ⠐⠇ ::t lord ::lcode eng
|
1893 |
+
::s ⠍ ::t m ::lcode eng
|
1894 |
+
::s ⠸⠍ ::t many ::lcode eng
|
1895 |
+
::s ⠰⠞ ::t ment ::lcode eng ::dont-use-at-start-of-word
|
1896 |
+
::s ⠍ ::t more ::lcode eng ::use-only-for-whole-word
|
1897 |
+
::s ⠐⠍ ::t mother ::lcode eng
|
1898 |
+
::s ⠍⠡ ::t much ::lcode eng ::use-only-for-whole-word
|
1899 |
+
::s ⠍⠌ ::t must ::lcode eng ::use-only-for-whole-word
|
1900 |
+
::s ⠍⠽⠋ ::t myself ::lcode eng ::use-only-for-whole-word
|
1901 |
+
::s ⠝ ::t n ::lcode eng
|
1902 |
+
::s ⠐⠝ ::t name ::lcode eng
|
1903 |
+
::s ⠝⠑⠉ ::t necessary ::lcode eng ::use-only-for-whole-word
|
1904 |
+
::s ⠝⠑⠊ ::t neither ::lcode eng ::use-only-for-whole-word
|
1905 |
+
::s ⠰⠎ ::t ness ::lcode eng ::dont-use-at-start-of-word
|
1906 |
+
::s ⠝ ::t not ::lcode eng ::use-only-for-whole-word
|
1907 |
+
::s ⠕⠄⠉ ::t o'clock ::lcode eng ::use-only-for-whole-word
|
1908 |
+
::s ⠷ ::t of ::lcode eng
|
1909 |
+
::s ⠐⠕ ::t one ::lcode eng
|
1910 |
+
::s ⠰⠛ ::t ong ::lcode eng ::dont-use-at-start-of-word
|
1911 |
+
::s ⠳ ::t ou ::lcode eng
|
1912 |
+
::s ⠨⠙ ::t ound ::lcode eng
|
1913 |
+
::s ⠨⠞ ::t ount ::lcode eng
|
1914 |
+
::s ⠐⠳ ::t ought ::lcode eng
|
1915 |
+
::s ⠳⠗⠧⠎ ::t ourselves ::lcode eng ::use-only-for-whole-word
|
1916 |
+
::s ⠳ ::t out ::lcode eng ::use-only-for-whole-word
|
1917 |
+
::s ⠪ ::t ow ::lcode eng
|
1918 |
+
::s ⠏ ::t p ::lcode eng
|
1919 |
+
::s ⠏⠙ ::t paid ::lcode eng ::use-only-for-whole-word
|
1920 |
+
::s ⠐⠏ ::t part ::lcode eng
|
1921 |
+
::s ⠏ ::t people ::lcode eng ::use-only-for-whole-word
|
1922 |
+
::s ⠏⠻⠓ ::t perhaps ::lcode eng ::use-only-for-whole-word
|
1923 |
+
::s ⠟ ::t q ::lcode eng
|
1924 |
+
::s ⠐⠟ ::t question ::lcode eng
|
1925 |
+
::s ⠟⠅ ::t quick ::lcode eng ::use-only-for-whole-word
|
1926 |
+
::s ⠟⠅⠻ ::t quicker ::lcode eng ::use-only-for-whole-word
|
1927 |
+
::s ⠟⠅⠑⠌ ::t quickest ::lcode eng ::use-only-for-whole-word
|
1928 |
+
::s ⠟ ::t quite ::lcode eng ::use-only-for-whole-word
|
1929 |
+
::s ⠗ ::t r ::lcode eng
|
1930 |
+
::s ⠗ ::t rather ::lcode eng ::use-only-for-whole-word
|
1931 |
+
::s ⠐⠗ ::t right ::lcode eng
|
1932 |
+
::s ⠗⠚⠉ ::t rejoice ::lcode eng
|
1933 |
+
::s ⠗⠚⠉⠛ ::t rejoicing ::lcode eng
|
1934 |
+
::s ⠎ ::t s ::lcode eng
|
1935 |
+
::s ⠎⠙ ::t said ::lcode eng ::use-only-for-whole-word
|
1936 |
+
::s ⠩ ::t sh ::lcode eng
|
1937 |
+
::s ⠩ ::t shall ::lcode eng ::use-only-for-whole-word
|
1938 |
+
::s ⠩⠙ ::t should ::lcode eng ::use-only-for-whole-word
|
1939 |
+
::s ⠨⠝ ::t sion ::lcode eng
|
1940 |
+
::s ⠎ ::t so ::lcode eng ::use-only-for-whole-word
|
1941 |
+
::s ⠐⠎ ::t some ::lcode eng
|
1942 |
+
::s ⠸⠎ ::t spirit ::lcode eng
|
1943 |
+
::s ⠌ ::t st ::lcode eng
|
1944 |
+
::s ⠌ ::t still ::lcode eng ::use-only-for-whole-word
|
1945 |
+
::s ⠎⠡ ::t such ::lcode eng ::use-only-for-whole-word
|
1946 |
+
::s ⠞ ::t t ::lcode eng
|
1947 |
+
::s ⠹ ::t th ::lcode eng
|
1948 |
+
::s ⠞ ::t that ::lcode eng ::use-only-for-whole-word
|
1949 |
+
::s ⠹ ::t this ::lcode eng ::use-only-for-whole-word
|
1950 |
+
::s ⠮ ::t the ::lcode eng
|
1951 |
+
::s ⠸⠮ ::t their ::lcode eng
|
1952 |
+
::s ⠮⠍⠧⠎ ::t themselves ::lcode eng ::use-only-for-whole-word
|
1953 |
+
::s ⠐⠮ ::t there ::lcode eng
|
1954 |
+
::s ⠘⠮ ::t these ::lcode eng
|
1955 |
+
::s ⠘⠹ ::t those ::lcode eng
|
1956 |
+
::s ⠐⠹ ::t through ::lcode eng
|
1957 |
+
::s ⠐⠞ ::t time ::lcode eng
|
1958 |
+
::s ⠰⠝ ::t tion ::lcode eng ::dont-use-at-start-of-word
|
1959 |
+
::s ⠖ ::t to ::lcode eng ::use-only-for-whole-word
|
1960 |
+
::s ⠞⠙ ::t today ::lcode eng ::use-only-for-whole-word
|
1961 |
+
::s ⠞⠛⠗ ::t together ::lcode eng ::use-only-for-whole-word
|
1962 |
+
::s ⠞⠍ ::t tomorrow ::lcode eng ::use-only-for-whole-word
|
1963 |
+
::s ⠞⠝ ::t tonight ::lcode eng ::use-only-for-whole-word
|
1964 |
+
::s ⠥ ::t u ::lcode eng
|
1965 |
+
::s ⠥⠝⠒ ::t uncon ::lcode eng ::use-only-at-start-of-word
|
1966 |
+
::s ⠥ ::t us ::lcode eng ::use-only-for-whole-word
|
1967 |
+
::s ⠠⠥⠲⠎⠲ ::t U.S. ::lcode eng
|
1968 |
+
::s ⠐⠥ ::t under ::lcode eng
|
1969 |
+
::s ⠘⠥ ::t upon ::lcode eng
|
1970 |
+
::s ⠧ ::t v ::lcode eng
|
1971 |
+
::s ⠧ ::t very ::lcode eng ::use-only-for-whole-word
|
1972 |
+
::s ⠺ ::t w ::lcode eng
|
1973 |
+
::s ⠴ ::t " ::lcode eng
|
1974 |
+
::s ⠴ ::t was ::lcode eng ::use-only-for-whole-word
|
1975 |
+
::s ⠶ ::t were ::lcode eng ::use-only-for-whole-word
|
1976 |
+
::s ⠱ ::t wh ::lcode eng
|
1977 |
+
::s ⠐⠱ ::t where ::lcode eng
|
1978 |
+
::s ⠱ ::t which ::lcode eng ::use-only-for-whole-word
|
1979 |
+
::s ⠘⠱ ::t whose ::lcode eng
|
1980 |
+
::s ⠺ ::t will ::lcode eng ::use-only-for-whole-word
|
1981 |
+
::s ⠾ ::t with ::lcode eng
|
1982 |
+
::s ⠘⠺ ::t word ::lcode eng
|
1983 |
+
::s ⠐⠺ ::t work ::lcode eng
|
1984 |
+
::s ⠸⠺ ::t world ::lcode eng
|
1985 |
+
::s ⠺⠙ ::t would ::lcode eng ::use-only-for-whole-word
|
1986 |
+
::s ⠭ ::t x ::lcode eng
|
1987 |
+
::s ⠽ ::t y ::lcode eng
|
1988 |
+
::s ⠽ ::t you ::lcode eng ::use-only-for-whole-word
|
1989 |
+
::s ⠽⠗ ::t your ::lcode eng ::use-only-for-whole-word
|
1990 |
+
::s ⠽⠗⠎ ::t yours ::lcode eng ::use-only-for-whole-word
|
1991 |
+
::s ⠽⠗⠋ ::t yourself ::lcode eng ::use-only-for-whole-word
|
1992 |
+
::s ⠽⠗⠧⠎ ::t yourselves ::lcode eng ::use-only-for-whole-word
|
1993 |
+
::s ⠐⠽ ::t young ::lcode eng
|
1994 |
+
::s ⠵ ::t z ::lcode eng
|
1995 |
+
::s ⠠⠴ ::t ’ ::lcode eng
|
1996 |
+
|
1997 |
+
::preserve ::from U+2190 ::to U+21FF ::comment Arrows
|
1998 |
+
::preserve ::from U+2200 ::to U+22FF ::comment Mathematical Operators
|
1999 |
+
::preserve ::from U+2300 ::to U+23FF ::comment Miscellaneous Technical
|
2000 |
+
::preserve ::from U+2500 ::to U+257F ::comment Box Drawing
|
2001 |
+
::preserve ::from U+2580 ::to U+259F ::comment Block Elements
|
2002 |
+
::preserve ::from U+25A0 ::to U+25FF ::comment Geometric Shapes
|
2003 |
+
::preserve ::from U+2600 ::to U+26FF ::comment Miscellaneous Symbols
|
2004 |
+
::preserve ::from U+27C0 ::to U+27ED ::comment Miscellaneous Mathematical Symbols-A
|
2005 |
+
::preserve ::from U+27F0 ::to U+27FF ::comment Supplemental Arrows-A
|
2006 |
+
::preserve ::from U+2900 ::to U+297F ::comment Supplemental Arrows-B
|
2007 |
+
::preserve ::from U+2980 ::to U+29FF ::comment Miscellaneous Mathematical Symbols-B
|
2008 |
+
::preserve ::from U+2A00 ::to U+2AFF ::comment Supplemental Mathematical Operators
|
2009 |
+
::preserve ::from U+2B00 ::to U+2BFF ::comment Miscellaneous Symbols and Arrows
|
2010 |
+
::preserve ::from U+2E00 ::to U+2E27 ::comment Supplemental Punctuation (excluding ⸨⸩)
|
2011 |
+
::preserve ::from U+2E2A ::to U+2E7F ::comment Supplemental Punctuation (cont'd)
|
2012 |
+
::preserve ::from U+18B00 ::to U+18CD5 ::comment Khitan Small Script
|
2013 |
+
::preserve ::from U+1D100 ::to U+1D1FF ::comment Musical Symbols
|
2014 |
+
::preserve ::from U+1D6A8 ::to U+1D7CB ::comment Mathematical Alphanumeric Symbols (Greek)
|
2015 |
+
::preserve ::from U+1D800 ::to U+1DAAF ::comment Sutton SignWriting
|
2016 |
+
::preserve ::from U+1F800 ::to U+1F8FF ::comment Supplemental Arrows-C
|
2017 |
+
::preserve ::from U+1FA00 ::to U+1FA6F ::comment Chess Symbols
|
2018 |
+
::preserve ::from U+1FB00 ::to U+1FBCF ::comment Symbols for Legacy Computing
|
2019 |
+
::preserve ::from U+1FA70 ::to U+1FAFF ::comment Symbols and Pictographs Extended-A
|
uroman/data/romanization-table.v1.2.1.txt
ADDED
@@ -0,0 +1,814 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
## European Latin extensions
|
3 |
+
# Vowels
|
4 |
+
::s Ä ::t Ae
|
5 |
+
::s Ö ::t Oe
|
6 |
+
::s Ü ::t Ue
|
7 |
+
::s Å ::t Aa
|
8 |
+
::s Æ ::t Ae
|
9 |
+
::s Ø ::t Oe
|
10 |
+
::s Œ ::t Oe
|
11 |
+
::s ä ::t ae
|
12 |
+
::s ö ::t oe
|
13 |
+
::s ü ::t ue
|
14 |
+
::s å ::t aa
|
15 |
+
::s æ ::t ae
|
16 |
+
::s ø ::t oe
|
17 |
+
::s œ ::t oe
|
18 |
+
# Consonants
|
19 |
+
::s Ç ::t S
|
20 |
+
::s ç ::t s
|
21 |
+
::s Ç ::t Ch ::lcode tur
|
22 |
+
::s ç ::t ch ::lcode tur
|
23 |
+
::s Ş ::t Sh
|
24 |
+
::s ş ::t sh
|
25 |
+
::s Ș ::t Sh
|
26 |
+
::s ș ::t sh
|
27 |
+
::s ß ::t ss
|
28 |
+
::s Ț ::t Ts
|
29 |
+
::s ț ::t ts
|
30 |
+
|
31 |
+
# Miscellaneous
|
32 |
+
::s ə ::t e
|
33 |
+
|
34 |
+
# English
|
35 |
+
::s chr ::t chr ::t-alt kr ::example chromosome, synchronize
|
36 |
+
::s Chr ::t Chr ::t-alt Kr ::example Christmas, Chrysler
|
37 |
+
::s eight ::t eight ::t-alt eit ::example eight, weight
|
38 |
+
::s Eight ::t Eight ::t-alt Eit ::example Eighteen
|
39 |
+
::s ight ::t ight ::t-alt ait ::example Knight
|
40 |
+
::s gh ::t gh ::t-alt f, ph, "" ::example laugh, daughter
|
41 |
+
::s high ::t high ::t-alt hai ::example highlight
|
42 |
+
::s High ::t High ::t-alt Hai ::example High School
|
43 |
+
::s Isle ::t Isle ::t-alt Ail ::use-only-at-start-of-word ::use-only-at-end-of-word ::example Isle
|
44 |
+
::s Island ::t Island ::t-alt Ailand ::use-only-at-start-of-word ::use-only-at-end-of-word ::example Island
|
45 |
+
::s kn ::t kn ::t-alt n ::use-only-at-start-of-word ::example knowledge
|
46 |
+
::s Kn ::t Kn ::t-alt N ::use-only-at-start-of-word ::example Knight
|
47 |
+
::s Mc ::t Mc ::t-alt Mac ::use-only-at-start-of-word ::example McNulty
|
48 |
+
::s mc ::t mc ::t-alt mac ::use-only-at-start-of-word
|
49 |
+
::s oo ::t oo ::t-alt u ::lcode eng ::example Brooklyn; Goose Bay
|
50 |
+
::s ph ::t ph ::t-alt f ::example alpha
|
51 |
+
::s Ph ::t Ph ::t-alt F ::example Philip
|
52 |
+
::s Thom ::t Thom ::t-alt Tom ::use-only-at-start-of-word ::example Thomas, Thompson
|
53 |
+
::s tion ::t tion ::t-alt shen ::example
|
54 |
+
::s Sean ::t Sean ::t-alt Shawn ::use-only-at-start-of-word ::use-only-at-end-of-word
|
55 |
+
::s ssion ::t ssion ::t-alt shen ::example Sessions
|
56 |
+
::s St ::t St ::t-alt Saint ::use-only-at-start-of-word ::use-only-at-end-of-word
|
57 |
+
::s St. ::t St. ::t-alt Saint ::use-only-at-start-of-word ::use-only-at-end-of-word
|
58 |
+
::s Wr ::t Wr ::t-alt R ::example Wren
|
59 |
+
::s wr ::t wr ::t-alt r ::example Cartwright
|
60 |
+
::s x ::t x ::t-alt ks ::example Mexico
|
61 |
+
::s x ::t x ::t-alt gz ::example example, anxiety, exhaust, exit
|
62 |
+
|
63 |
+
# French
|
64 |
+
::s â ::t a ::t-alt as ::example pâte/paste, pastry
|
65 |
+
::s ê ::t e ::t-alt es ::example fête/feast
|
66 |
+
::s î ::t i ::t-alt is ::example île/isle
|
67 |
+
::s ô ::t o ::t-alt os ::example côte/coast
|
68 |
+
::s û ::t u ::t-alt us ::example août/August
|
69 |
+
::s eaux ::t eaux ::t-alt o ::example Bordeaux
|
70 |
+
::s eau ::t eau ::t-alt o ::example Chateau
|
71 |
+
::s auld ::t auld ::t-alt o ::use-only-at-end-of-word ::example Renauld
|
72 |
+
::s ault ::t ault ::t-alt o ::use-only-at-end-of-word ::example Renault
|
73 |
+
::s oux ::t oux ::t-alt u
|
74 |
+
::s ois ::t ois ::t-alt oa ::use-only-at-end-of-word ::example Dubois
|
75 |
+
|
76 |
+
# German
|
77 |
+
::s Sch ::t Sch ::t-alt Sh
|
78 |
+
::s sch ::t sch ::t-alt sh
|
79 |
+
::s stein ::t stein ::t-alt shtain
|
80 |
+
::s dt ::t dt ::t-alt tt ::use-only-at-end-of-word ::example Schmidt
|
81 |
+
|
82 |
+
# Dutch
|
83 |
+
::s ij ::t ij ::t-alt ai
|
84 |
+
::s Ij ::t Ij ::t-alt Ai
|
85 |
+
|
86 |
+
# Greek
|
87 |
+
::s Ι ::t I
|
88 |
+
::s ι ::t i
|
89 |
+
::s ί ::t i
|
90 |
+
::s ἶ ::t i
|
91 |
+
::s Υ ::t Y
|
92 |
+
::s υ ::t y
|
93 |
+
::s Ρ ::t R
|
94 |
+
::s ρ ::t r
|
95 |
+
::s Ντ ::t D
|
96 |
+
::s ντ ::t nd ::t-alt d
|
97 |
+
# ::s ντζ ::t ntz
|
98 |
+
::s Μπ ::t B
|
99 |
+
::s μπ ::t mb ::t-alt b
|
100 |
+
::s γγ ::t ng
|
101 |
+
::s γκ ::t ng ::t-alt g
|
102 |
+
::s ει ::t ei ::t-alt i
|
103 |
+
::s ου ::t ou ::t-alt u
|
104 |
+
::s χ ::t ch ::t-alt kh
|
105 |
+
|
106 |
+
# Cyrillic
|
107 |
+
::s Г ::t G ::t-alt H
|
108 |
+
::s г ::t g ::t-alt h
|
109 |
+
::s Е ::t E ::t-alt Ye
|
110 |
+
::s е ::t e ::t-alt ye
|
111 |
+
::s Ё ::t E ::t-alt Yo
|
112 |
+
::s ё ::t e ::t-alt yo
|
113 |
+
::s Х ::t Kh ::t-alt Ch, H ::comment Cyrillic capital ha
|
114 |
+
::s х ::t kh ::t-alt ch, h ::comment Cyrillic small ha
|
115 |
+
::s Щ ::t Shch ::t-alt Sh
|
116 |
+
::s щ ::t shch ::t-alt sh
|
117 |
+
::s Ъ ::t ::comment Cyrillic capital hard sign
|
118 |
+
::s ъ ::t ::comment Cyrillic small hard sign
|
119 |
+
::s Ы ::t Y ::comment Cyrillic capital yeru
|
120 |
+
::s ы ::t y ::comment Cyrillic small yeru
|
121 |
+
::s Ь ::t ::comment Cyrillic capital soft sign
|
122 |
+
::s ь ::t ::comment Cyrillic small soft sign
|
123 |
+
|
124 |
+
::s Ҥ ::t Ng ::comment Cyrillic capital ligature EN GHE
|
125 |
+
::s ҥ ::t ng ::comment Cyrillic small ligature EN GHE
|
126 |
+
::s Ә ::t e ::comment Cyrillic capital schwa
|
127 |
+
::s ә ::t e ::comment Cyrillic small schwa
|
128 |
+
::s Ӏ ::t ' ::comment Cyrillic palochka
|
129 |
+
::s Ҵ ::t TS ::comment Cyrillic capital ligature te tse, used in Abkhasian
|
130 |
+
::s ҵ ::t ts ::comment Cyrillic small ligature te tse, used in Abkhasian
|
131 |
+
::s Ӕ ::t AE ::comment Cyrillic capital ligature a ie
|
132 |
+
::s ӕ ::t ae ::comment Cyrillic small ligature a ie
|
133 |
+
::s Г ::t H ::lcode ukr ::comment Ukrainian capital letter he
|
134 |
+
::s г ::t h ::lcode ukr ::comment Ukrainian small letter he
|
135 |
+
::s Ґ ::t G ::lcode ukr ::comment Ukrainian capital letter ghe
|
136 |
+
::s ґ ::t g ::lcode ukr ::comment Ukrainian small letter ghe
|
137 |
+
|
138 |
+
# Gothic
|
139 |
+
::s 𐌴 ::t e ::comment Gothic letter aihvus
|
140 |
+
::s 𐌹 ::t i ::comment Gothic letter eis
|
141 |
+
::s 𐍇 ::t x ::comment Gothic letter iggws
|
142 |
+
|
143 |
+
# Georgian
|
144 |
+
::s ა ::t a ::comment Georgian letter an
|
145 |
+
::s ე ::t e ::comment Georgian letter en
|
146 |
+
::s ი ::t i ::comment Georgian letter in
|
147 |
+
::s ო ::t o ::comment Georgian letter on
|
148 |
+
::s უ ::t u ::comment Georgian letter un
|
149 |
+
|
150 |
+
# Armenian
|
151 |
+
::s Ա ::t a ::comment Armenian capital letter ayb
|
152 |
+
::s ա ::t a ::comment Armenian small letter ayb
|
153 |
+
::s Ե ::t e ::comment Armenian capital letter ech
|
154 |
+
::s ե ::t e ::comment Armenian small letter ech
|
155 |
+
::s և ::t ev ::comment Armenian small ligature ech yiwn
|
156 |
+
::s Է ::t e ::comment Armenian capital letter eh
|
157 |
+
::s է ::t e ::comment Armenian small letter eh
|
158 |
+
::s Ի ::t i ::comment Armenian capital letter ini
|
159 |
+
::s ի ::t i ::comment Armenian small letter ini
|
160 |
+
::s Օ ::t o ::comment Armenian capital letter oh
|
161 |
+
::s օ ::t o ::comment Armenian small letter oh
|
162 |
+
|
163 |
+
## Japanese
|
164 |
+
# Katakana
|
165 |
+
::s シ ::t shi
|
166 |
+
::s チ ::t chi
|
167 |
+
::s フ ::t fu
|
168 |
+
::s ジ ::t ji
|
169 |
+
::s ヂ ::t ji
|
170 |
+
::s ヅ ::t zu
|
171 |
+
::s シャ ::t sha
|
172 |
+
::s シュ ::t shu
|
173 |
+
::s ショ ::t sho
|
174 |
+
::s チャ ::t cha
|
175 |
+
::s チェ ::t che
|
176 |
+
::s チュ ::t chu
|
177 |
+
::s チョ ::t cho
|
178 |
+
::s ジャ ::t ja
|
179 |
+
::s ジュ ::t ju
|
180 |
+
::s ジョ ::t jo
|
181 |
+
::s ジェ ::t je
|
182 |
+
::s ヂャ ::t ja
|
183 |
+
::s ヂュ ::t ju
|
184 |
+
::s ヂョ ::t jo
|
185 |
+
::s フェ ::t fe
|
186 |
+
::s ヴェ ::t ve
|
187 |
+
::s フィ ::t fi
|
188 |
+
::s ウィ ::t wi
|
189 |
+
::s ヴィ ::t vi
|
190 |
+
::s ティ ::t ti
|
191 |
+
::s ディ ::t di
|
192 |
+
::s ッ ::t (__SOKUON__) ::comment katakana double following consonant
|
193 |
+
::s ー ::t (__CHOONPU__) ::comment katakana prolonged sound mark
|
194 |
+
# Hiragana
|
195 |
+
::s し ::t shi
|
196 |
+
::s ち ::t chi
|
197 |
+
::s つ ::t tsu
|
198 |
+
::s ふ ::t fu
|
199 |
+
::s を ::t o
|
200 |
+
::s じ ::t ji
|
201 |
+
::s ぢ ::t ji
|
202 |
+
::s づ ::t zu
|
203 |
+
::s しゃ ::t sha
|
204 |
+
::s しゅ ::t shu
|
205 |
+
::s しょ ::t sho
|
206 |
+
::s ちゃ ::t cha
|
207 |
+
::s ちゅ ::t chu
|
208 |
+
::s ちょ ::t cho
|
209 |
+
::s じゃ ::t ja
|
210 |
+
::s じゅ ::t ju
|
211 |
+
::s じょ ::t jo
|
212 |
+
::s ぢゃ ::t ja
|
213 |
+
::s ぢゅ ::t ju
|
214 |
+
::s ぢょ ::t jo
|
215 |
+
::s っ ::t (__SOKUON__) ::comment hiragana double following consonant
|
216 |
+
::s 々 ::t ² ::comment ideographic iteration mark ::annotation repetition-sign
|
217 |
+
|
218 |
+
::s フ ::t fu ::t-alt f
|
219 |
+
::s キ ::t ki ::t-alt k
|
220 |
+
::s ク ::t ku ::t-alt k
|
221 |
+
::s ラ ::t ra ::t-alt la
|
222 |
+
::s リ ::t ri ::t-alt li
|
223 |
+
::s ル ::t ru ::t-alt lu, l, r
|
224 |
+
::s レ ::t re ::t-alt le
|
225 |
+
::s ロ ::t ro ::t-alt lo
|
226 |
+
::s ム ::t mu ::t-alt m ::example キム = Kim
|
227 |
+
::s シ ::t shi ::t-alt si ::example メキシコ = meksiko (Mexico)
|
228 |
+
::s ス ::t su ::t-alt s
|
229 |
+
::s ト ::t to ::t-alt t
|
230 |
+
::s ツ ::t tsu ::t-alt tu, ts ::example シュルツ = Schultz
|
231 |
+
|
232 |
+
# Chinese
|
233 |
+
::s 邦 ::t bang ::t-alt bon, bum, bun, pon
|
234 |
+
::s 鲍 ::t bao ::t-alt bow
|
235 |
+
::s 堡 ::t bao ::t-alt berg, burg, bourg, burgh
|
236 |
+
::s 贝 ::t bei ::t-alt ber
|
237 |
+
::s 本 ::t ben ::t-alt bern, bon, bourn, burn
|
238 |
+
::s 彼得 ::t bide ::t-alt peter, pet
|
239 |
+
::s 伯 ::t bo ::t-alt ber
|
240 |
+
::s 波 ::t bo ::t-alt po
|
241 |
+
::s 布 ::t bu ::t-alt b
|
242 |
+
::s 策 ::t ce ::t-alt tze, tzer
|
243 |
+
::s 曾 ::t ceng ::t-alt tzen, zen
|
244 |
+
::s 彻 ::t che ::t-alt tche
|
245 |
+
::s 茨 ::t ci ::t-alt ts, tz, z
|
246 |
+
::s 兹 ::t ci ::t-alt ds, dz, tz, z, zi
|
247 |
+
::s 蒂 ::t di ::t-alt ti, tti
|
248 |
+
::s 丁 ::t ding ::t-alt din, tin
|
249 |
+
::s 顿 ::t dun ::t-alt ton
|
250 |
+
::s 多 ::t duo ::t-alt do, dor, to
|
251 |
+
::s 尔 ::t er ::t-alt l, le, ll, r
|
252 |
+
::s 弗 ::t fu ::t-alt f, fer, pher, v, ver, vir
|
253 |
+
::s 夫 ::t fu ::t-alt f, v, v
|
254 |
+
::s 福 ::t fu ::t-alt faw, for, ford
|
255 |
+
::s 哥 ::t ge ::t-alt go, co
|
256 |
+
::s 戈 ::t ge ::t-alt go
|
257 |
+
::s 各 ::t ge ::t-alt go, co
|
258 |
+
::s 赫 ::t he ::t-alt ch, che, cher, ge
|
259 |
+
::s 华 ::t hua ::t-alt ver, wa, war, wer ::example Washington
|
260 |
+
::s 怀 ::t huai ::t-alt whi, wi, wy
|
261 |
+
::s 惠 ::t hui ::t-alt wha, whea
|
262 |
+
::s 基 ::t ji ::t-alt ki, chi
|
263 |
+
::s 吉 ::t ji ::t-alt gi, gui
|
264 |
+
::s 加 ::t jia ::t-alt ca, ga, ka ::example Canada
|
265 |
+
::s 杰 ::t jie ::t-alt ger
|
266 |
+
::s 金 ::t jin ::t-alt kin, gin
|
267 |
+
::s 斤 ::t jin ::t-alt zin
|
268 |
+
::s 康 ::t kang ::t-alt con, corn
|
269 |
+
::s 考 ::t kao ::t-alt cow, cour
|
270 |
+
::s 克 ::t ke ::t-alt k, che, cher
|
271 |
+
::s 科 ::t ke ::t-alt ko
|
272 |
+
::s 拉 ::t la ::t-alt ra ::example Tirana
|
273 |
+
::s 朗 ::t lang ::t-alt lon, ron
|
274 |
+
::s 赖 ::t lai ::t-alt ri
|
275 |
+
::s 劳 ::t lao ::t-alt low
|
276 |
+
::s 勒 ::t lei ::t-alt ler
|
277 |
+
::s 伦 ::t lun ::t-alt lon, ran, ron
|
278 |
+
::s 里 ::t li ::t-alt ri
|
279 |
+
::s 利 ::t li ::t-alt ri ::example Ferrari
|
280 |
+
::s 隆 ::t long ::t-alt lon, lum, lund
|
281 |
+
::s 罗 ::t luo ::t-alt l, lo, lu, ro, row, ru
|
282 |
+
::s 洛 ::t luo ::t-alt lo, low, ro
|
283 |
+
::s 默 ::t mo ::t-alt mer
|
284 |
+
::s 纳 ::t na ::t-alt ne, ner
|
285 |
+
::s 珀 ::t po ::t-alt per
|
286 |
+
::s 奇 ::t qi ::t-alt chi, dge, ge, tch
|
287 |
+
::s 齐 ::t qi ::t-alt tsi, zi
|
288 |
+
::s 乔 ::t qiao ::t-alt jo
|
289 |
+
::s 青 ::t qing ::t-alt tsing
|
290 |
+
::s 琼 ::t qiong ::t-alt jon, jum, jun
|
291 |
+
::s 瑟 ::t se ::t-alt the
|
292 |
+
::s 什 ::t shen ::t-alt sh
|
293 |
+
::s 圣 ::t sheng ::t-alt san, sao, saint
|
294 |
+
::s 斯 ::t si ::t-alt s, rth, th ::example Alaska
|
295 |
+
::s 索 ::t suo ::t-alt tho
|
296 |
+
::s 特 ::t te ::t-alt t
|
297 |
+
::s 翁 ::t weng ::t-alt on
|
298 |
+
::s 沃 ::t wo ::t-alt ver, vo, war, wer
|
299 |
+
::s 乌 ::t wu ::t-alt ou, u
|
300 |
+
::s 希 ::t xi ::t-alt chi, hi, shi
|
301 |
+
::s 西 ::t xi ::t-alt s, si
|
302 |
+
::s 锡 ::t xi ::t-alt ci, si, thi, zi
|
303 |
+
::s 夏 ::t xia ::t-alt ha, cha, cia, sha, tia
|
304 |
+
::s 香 ::t xiang ::t-alt chan, cham
|
305 |
+
::s 歇 ::t xie ::t-alt she
|
306 |
+
::s 谢 ::t xie ::t-alt che, she
|
307 |
+
::s 辛 ::t xin ::t-alt cin, sen, sin, sing, sun, zen
|
308 |
+
::s 欣 ::t xin ::t-alt hin, shin
|
309 |
+
::s 休 ::t xiu ::t-alt hu, hue
|
310 |
+
::s 修 ::t xiu ::t-alt ciu, siu, thew, tiu
|
311 |
+
::s 许 ::t xu ::t-alt hue, schue
|
312 |
+
::s 逊 ::t xun ::t-alt son
|
313 |
+
::s 耶 ::t ye ::t-alt yer, ier
|
314 |
+
::s 泽 ::t ze ::t-alt ser
|
315 |
+
::s 扎 ::t zha ::t-alt za
|
316 |
+
::s 詹 ::t zhan ::t-alt ja, jam, jan, jen, jon
|
317 |
+
::s 治 ::t zhi ::t-alt ge ::example George
|
318 |
+
|
319 |
+
## Numbers
|
320 |
+
# Chinese and Japanese numbers
|
321 |
+
::s 零 ::num 0
|
322 |
+
::s 〇 ::num 0
|
323 |
+
::s 一 ::num 1
|
324 |
+
::s 二 ::num 2
|
325 |
+
::s 三 ::num 3
|
326 |
+
::s 四 ::num 4
|
327 |
+
::s 五 ::num 5
|
328 |
+
::s 六 ::num 6
|
329 |
+
::s 七 ::num 7
|
330 |
+
::s 八 ::num 8
|
331 |
+
::s 九 ::num 9
|
332 |
+
::s 十 ::num 10
|
333 |
+
::s 百 ::num 100
|
334 |
+
::s 千 ::num 1000
|
335 |
+
::s 万 ::num 10000
|
336 |
+
::s 萬 ::num 10000
|
337 |
+
::s 亿 ::num 100000000
|
338 |
+
::s 億 ::num 100000000
|
339 |
+
::s 兆 ::num 1000000000000
|
340 |
+
::s 京 ::num 10000000000000000
|
341 |
+
|
342 |
+
::s 北京 ::t beijing
|
343 |
+
::s 京都 ::t jingdou
|
344 |
+
::s 东京 ::t dongjing
|
345 |
+
::s 京胡 ::t jinghu
|
346 |
+
::s 南京 ::t nanjing
|
347 |
+
::s 普京 ::t pujing ::comment Putin
|
348 |
+
::s 東京 ::t dongjing ::comment Tokyo
|
349 |
+
::s 京兆 ::t jingzhao
|
350 |
+
|
351 |
+
::s ㎢ ::t km²
|
352 |
+
::s ㎥ ::t m³
|
353 |
+
::s ㎝ ::t cm
|
354 |
+
|
355 |
+
## Indian
|
356 |
+
# see mostly under UnicodeDataOverwrite.txt
|
357 |
+
|
358 |
+
# Malayalam
|
359 |
+
::s ൗ ::t au ::comment MALAYALAM AU LENGTH MARK
|
360 |
+
|
361 |
+
# Tamil
|
362 |
+
::s ட ::t d ::comment most commonly d, but t when word-initial or in a doubled consonant
|
363 |
+
::s ஃப ::t f ::comment h+p=f
|
364 |
+
::s ஃஜ ::t z ::comment h+j=z
|
365 |
+
|
366 |
+
# Myanmar/Burmese
|
367 |
+
# ::s ့ ::t ::comment dot below, denotes creaky tone
|
368 |
+
# ::s း ::t ::comment visarga, denotes high tone
|
369 |
+
::s ၌ ::t -nai ::comment locative
|
370 |
+
::s ၍ ::t -jwe ::comment completed
|
371 |
+
::s ၎ ::t legau ::comment aforementioned
|
372 |
+
::s ၏ ::t -i ::comment genitive
|
373 |
+
|
374 |
+
# Lao
|
375 |
+
::s ັ ::t a ::comment vowel sign mai kan
|
376 |
+
::s ົ ::t o ::comment vowel sign mai kon
|
377 |
+
::s ູ ::t uu ::comment vowel sign uu
|
378 |
+
::s ຽ ::t y ::comment semivowel sign nyo
|
379 |
+
::s ຼ ::t l ::comment semivowel sign lo
|
380 |
+
::s ລ ::t l ::comment lo loot
|
381 |
+
::s ຣ ::t l ::comment lo ling
|
382 |
+
::s ໝ ::t m ::comment ho mo
|
383 |
+
::s ໜ ::t n ::comment ho no
|
384 |
+
::s ຢ ::t y ::comment yo
|
385 |
+
::s ໍ ::t oo ::comment niggahita (possibly also nasal -m in final position)
|
386 |
+
::s ໆ ::t ² ::comment Lao ko la ::annotation repetition-sign
|
387 |
+
::s ຯ ::t ... ::comment Lao ellipsis
|
388 |
+
|
389 |
+
# Thai
|
390 |
+
::s ออ ::t o
|
391 |
+
::s อั ::t a
|
392 |
+
::s อิ ::t i
|
393 |
+
::s ๆ ::t ² ::comment Thai character maiyamok ::annotation repetition-sign
|
394 |
+
|
395 |
+
# Khmer
|
396 |
+
::s ័ ::t "" ::comment Khmer samyok sannya: indicates deviation from the general rules of pronunciation
|
397 |
+
::s ៏ ::t "" ::comment Khmer sign ahsda: denotes stressed intonation in some single-consonant words
|
398 |
+
::s ៍ ::t "" ::comment Khmer sign toandakhiat: indicates that the base character is not pronounced
|
399 |
+
::s ៌ ::t "" ::comment Khmer sign robat: a diacritic historically corresponding to the repha form of ra in Devanagari
|
400 |
+
::s ប៉ ::t pa ::comment Khmer ba + musĕkâtônd -> pa
|
401 |
+
::s ៗ ::t ² ::comment Khmer sign lek too ::annotation repetition-sign
|
402 |
+
|
403 |
+
## Semitic languages
|
404 |
+
# Arabic
|
405 |
+
::s و ::t w ::comment Arabic letter waw ::t-alt o, u ::lcode ara
|
406 |
+
::s ء ::t ' ::comment hamza
|
407 |
+
::s ٔ ::t ' ::comment hamza above
|
408 |
+
::s ٕ ::t ' ::comment hamza below
|
409 |
+
::s ع ::t ' ::comment ain
|
410 |
+
::s آ ::t a ::comment alef madda
|
411 |
+
::s ٓا ::t a ::comment Arabic maddah above plus alef (presumably an ill-formed version of آ; found 1 instance in Urdu text)
|
412 |
+
::s إ ::t i ::comment alef with hamza below
|
413 |
+
::s ٱ ::t a ::comment alef wasla ::comment typically indicates liaison with preceding word
|
414 |
+
::s ة ::t a ::comment teh marbuta
|
415 |
+
::s ۃ ::t a ::comment teh marbuta goal ::comment Used in Punjabi, Sindhi. Different from plain 'teh marbuta'?
|
416 |
+
::s ي ::t y ::comment Arabic yeh
|
417 |
+
::s ى ::t a ::comment alef maksura
|
418 |
+
::s ﻯ ::t a ::comment alef maksura isolated form
|
419 |
+
::s ﻰ ::t a ::comment alef maksura final form
|
420 |
+
::s ﯨ ::t a ::comment Uighur Kazakh Kirghiz alef maksura initial form
|
421 |
+
::s ﯩ ::t a ::comment Uighur Kazakh Kirghiz alef maksura medial form
|
422 |
+
::s ٰ ::t a ::comment Arabic letter superscript alef
|
423 |
+
::s ـ ::t ::comment tatweel (filler)
|
424 |
+
::s َ ::t a ::comment fatha ("-a")
|
425 |
+
::s ُ ::t u ::comment damma ("-u")
|
426 |
+
::s ِ ::t i ::comment kasra ("-i")
|
427 |
+
::s ْ ::t ::comment sukun (no vowel)
|
428 |
+
::s ۡ ::t ::comment small high dotless head of khah; like sukun (no vowel); used in Kashmiri, Assamese
|
429 |
+
::s ً ::t ::comment fathatan ("-an")
|
430 |
+
::s اً ::t an ::comment alef + fathatan
|
431 |
+
::s ٌ ::t ::comment dammatan ("-un")
|
432 |
+
::s ٍ ::t ::comment kasratan ("-in")
|
433 |
+
::s ّ ::t ::comment shadda (consonant doubler)
|
434 |
+
::s ڃ ::t ny ::comment Arabic letter nyeh U+0683 (used in Sindhi (snd))
|
435 |
+
::s ڄ ::t dy ::comment Arabic letter dyeh U+0684 (used in Sindhi (snd))
|
436 |
+
::s ۾ ::t men ::comment Sindhi postposition men
|
437 |
+
::s ؑ ::t alayhe wasallam ::comment "upon him be peace"
|
438 |
+
::s ﷴ ::t mohammad ::comment "Mohammad"
|
439 |
+
::s ﷸ ::t wasallam ::comment "and peace"
|
440 |
+
::s ﷺ ::t sallallahou alayhe wasallam ::comment "prayer of God be upon him and his family and peace"
|
441 |
+
|
442 |
+
# Farsi
|
443 |
+
::s ی ::t i ::t-alt y ::comment Contributed by Nima
|
444 |
+
::s ای ::t i ::t-alt ai ::use-only-at-start-of-word ::comment Contributed by Nima
|
445 |
+
::s هٔ ::t eye ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
|
446 |
+
::s و ::t v ::t-alt o, u ::lcode fas ::comment Arabic letter waw
|
447 |
+
::s ض ::t z ::t-alt d ::lcode fas ::comment Contributed by Marjan
|
448 |
+
::s ث ::t s ::t-alt th ::lcode fas ::comment Contributed by Marjan
|
449 |
+
::s ذ ::t z ::t-alt th ::lcode fas ::comment Contributed by Nima
|
450 |
+
::s ع ::t a ::t-alt ' ::lcode fas ::comment Contributed by Nima
|
451 |
+
::s عا ::t a ::lcode fas ::comment Contributed by Nima
|
452 |
+
::s عی ::t i ::t-alt iy ::lcode fas ::comment Contributed by Nima
|
453 |
+
::s عو ::t u ::t-alt o, av ::lcode fas ::comment Contributed by Nima
|
454 |
+
::s چ ::t ch ::t-alt tch, tsh ::lcode fas ::comment Contributed by Nima
|
455 |
+
::s ه ::t e ::t-alt h ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
|
456 |
+
::s ::t "" ::t-alt " " ::lcode fas ::comment source is character "zero-width non-joiner" (U+200C); Contributed by Nima
|
457 |
+
::s غ ::t gh ::t-alt g ::lcode fas
|
458 |
+
::s آئی ::t ai ::t-alt ae ::lcode fas
|
459 |
+
::s ائی ::t ai ::t-alt ae ::lcode fas
|
460 |
+
::s آئو ::t au ::t-alt ao ::lcode fas
|
461 |
+
::s ائو ::t au ::t-alt ao ::lcode fas
|
462 |
+
|
463 |
+
# Kashmiri (so far: educated guesses)
|
464 |
+
::s ٖ ::t a ::comment Arabic subscript alef U+0656
|
465 |
+
::s ٗ ::t u ::comment Arabic inverted damma U+0657
|
466 |
+
::s ۚ ::t j ::comment Arabic small high jeem U+06DA
|
467 |
+
::s ۪ ::t ::comment Arabic empty centre low stop U+06EA
|
468 |
+
::s ۬ ::t ::comment Arabic rounded high stop with filled center U+06EC
|
469 |
+
|
470 |
+
# Pashto
|
471 |
+
::s ٙ ::t e
|
472 |
+
|
473 |
+
# Hebrew
|
474 |
+
::s ב ::t v ::comment Hebrew letter bet ::t-alt b
|
475 |
+
::s כ ::t k ::comment Hebrew letter kaf ::t-alt kh
|
476 |
+
::s ך ::t k ::comment Hebrew letter kaf ::t-alt kh
|
477 |
+
::s פ ::t f ::comment Hebrew letter pe ::t-alt p
|
478 |
+
::s ש ::t sh ::comment Hebrew letter shin ::t-alt s
|
479 |
+
::s ו ::t v ::comment Hebrew letter vav ::t-alt o, u
|
480 |
+
::s ח ::t ch ::comment Hebrew letter het ::t-alt h ::use-alt-in-pointed
|
481 |
+
::s ק ::t q ::t-alt k ::use-alt-in-pointed
|
482 |
+
::s וֹ ::t o
|
483 |
+
::s וּ ::t u
|
484 |
+
::s קְוָ ::t qva ::t-alt kva ::use-alt-in-pointed
|
485 |
+
::s י ::t y
|
486 |
+
::s יּ ::t y
|
487 |
+
::s יָּ ::t ya
|
488 |
+
::s ע ::t '
|
489 |
+
::s ִי ::t i ::t-alt iy ::use-alt-in-pointed
|
490 |
+
::s ֵי ::t e
|
491 |
+
::s ִיּ ::t iy
|
492 |
+
::s ִיָּ ::t iya
|
493 |
+
::s ױ ::t oy
|
494 |
+
::s א ::t a ::t-alt '
|
495 |
+
::s אָ ::t a
|
496 |
+
::s ֹא ::t o
|
497 |
+
::s אַ ::t 'a
|
498 |
+
::s אֲ ::t 'a
|
499 |
+
::s אֶ ::t e
|
500 |
+
::s אֱ ::t e
|
501 |
+
::s פ ::t f
|
502 |
+
::s פּ ::t p
|
503 |
+
::s פַּ ::t pa
|
504 |
+
::s פְּ ::t pe ::t-alt p ::use-alt-in-pointed
|
505 |
+
::s שׁ ::t sh
|
506 |
+
::s שָׁ ::t sha
|
507 |
+
::s שָּׁ ::t sha ::comment ?
|
508 |
+
::s שְׁ ::t she ::t-alt sh ::use-alt-in-pointed
|
509 |
+
::s שֶׁ ::t she
|
510 |
+
::s שִׁ ::t shi
|
511 |
+
::s שֻׁ ::t shu
|
512 |
+
::s שׂ ::t s
|
513 |
+
::s שָׂ ::t sa
|
514 |
+
::s שְׂ ::t s ::t-alt se ::use-alt-in-pointed
|
515 |
+
::s כּ ::t k
|
516 |
+
::s כֶּ ::t ke
|
517 |
+
::s כֹּ ::t ko
|
518 |
+
::s בּ ::t b
|
519 |
+
::s בַּ ::t ba
|
520 |
+
::s בָּ ::t ba
|
521 |
+
::s בְּ ::t be ::t-alt b ::use-alt-in-pointed
|
522 |
+
::s בֶּ ::t be
|
523 |
+
::s תּ ::t t
|
524 |
+
::s תַּ ::t ta
|
525 |
+
::s תֵּ ::t te
|
526 |
+
::s תִּ ::t ti
|
527 |
+
::s דָּ ::t da
|
528 |
+
::s דְּ ::t de ::t-alt d ::use-alt-in-pointed
|
529 |
+
::s גּ ::t g
|
530 |
+
::s לֵּ ::t le
|
531 |
+
::s ד׳ ::t dh
|
532 |
+
::s ג׳ ::t j
|
533 |
+
::s ת׳ ::t th
|
534 |
+
::s ז׳ ::t zh
|
535 |
+
::s חַ ::t ach ::comment furtive patah ::use-only-at-end-of-word
|
536 |
+
::s עַ ::t a' ::comment furtive patah ::use-only-at-end-of-word
|
537 |
+
::s הַּ ::t ah ::comment furtive patah ::use-only-at-end-of-word
|
538 |
+
::s ַ ::t a ::comment Hebrew point patah
|
539 |
+
::s ֲ ::t a ::comment Hebrew point hataf patah (hataf = reduced)
|
540 |
+
::s ֳ ::t o ::comment Hebrew point hataf qamats
|
541 |
+
::s ָ ::t a ::comment Hebrew point qamats ::t-alt o ::use-alt-in-pointed
|
542 |
+
::s ֶ ::t e ::comment Hebrew point segol
|
543 |
+
::s ֱ ::t e ::comment Hebrew point hataf segol (hataf = reduced)
|
544 |
+
::s ְ ::t e ::comment Hebrew point sheva ::t-alt "" ::use-alt-in-pointed
|
545 |
+
::s ֵ ::t e ::comment Hebrew point tsere
|
546 |
+
::s ִ ::t i ::comment Hebrew point hiriq
|
547 |
+
::s ֹ ::t o ::comment Hebrew point holam
|
548 |
+
::s ֻ ::t u ::comment Hebrew point qubuts
|
549 |
+
# ::s ּ ::t "" ::comment Hebrew point dagesh or mapiq
|
550 |
+
|
551 |
+
# Yiddish
|
552 |
+
::s א ::t a ::lcode yid ::comment called "silent" alef
|
553 |
+
::s אי ::t y ::lcode yid
|
554 |
+
::s איי ::t ey ::lcode yid
|
555 |
+
::s או ::t u ::lcode yid
|
556 |
+
::s אוי ::t oy ::lcode yid
|
557 |
+
::s אַ ::t a ::lcode yid
|
558 |
+
::s אָ ::t o ::lcode yid
|
559 |
+
::s ב ::t b ::lcode yid
|
560 |
+
::s בֿ ::t v ::lcode yid
|
561 |
+
::s דזש ::t dzh ::lcode yid
|
562 |
+
::s ו ::t u ::lcode yid
|
563 |
+
::s וּ ::t u ::lcode yid
|
564 |
+
::s וֹ ::t o ::lcode yid
|
565 |
+
::s װ ::t v ::lcode yid
|
566 |
+
::s ווא ::t wa ::lcode yid
|
567 |
+
::s וואַ ::t wa ::lcode yid
|
568 |
+
::s ווע ::t we ::lcode yid
|
569 |
+
::s ווי ::t wi ::lcode yid
|
570 |
+
::s וואוי ::t wo ::lcode yid
|
571 |
+
::s וי ::t oy ::lcode yid
|
572 |
+
::s זש ::t zh ::lcode yid
|
573 |
+
::s ח ::t ch ::lcode yid
|
574 |
+
::s טש ::t tsh ::lcode yid
|
575 |
+
::s יִ::t i ::lcode yid
|
576 |
+
::s יי ::t ey ::lcode yid ::comment maybe "yi" at beginning of word
|
577 |
+
::s ײַ ::t ay ::lcode yid
|
578 |
+
::s כּ ::t k ::lcode yid
|
579 |
+
::s כ ::t ch ::lcode yid
|
580 |
+
::s ך ::t ch ::lcode yid
|
581 |
+
::s ע ::t e ::lcode yid
|
582 |
+
::s פּ ::t p ::lcode yid
|
583 |
+
::s פֿ ::t f ::lcode yid
|
584 |
+
::s ף ::t f ::lcode yid ::comment sometimes p
|
585 |
+
::s ק ::t k ::lcode yid
|
586 |
+
::s ת ::t s ::lcode yid
|
587 |
+
|
588 |
+
# Syriac/Aramaic (should be vetted by expert)
|
589 |
+
::s ܰ ::t a ::comment Syriac pthaha above
|
590 |
+
::s ܲ ::t a ::comment Syriac pthaha dotted
|
591 |
+
::s ܳ ::t aa ::comment Syriac zqapha above
|
592 |
+
::s ܴ ::t aa ::comment Syriac zqapha below
|
593 |
+
::s ܵ ::t aa ::comment Syriac zqapha dotted
|
594 |
+
::s ܶ ::t e ::comment Syriac rbasa above
|
595 |
+
::s ܷ ::t e ::comment Syriac rbasa below
|
596 |
+
::s ܿ ::t o ::comment Syriac rwaha
|
597 |
+
::s ܸ ::t e ::comment Syriac dotted zlama horizontal
|
598 |
+
::s ܹ ::t e ::comment Syriac dotted zlama angular
|
599 |
+
::s ܺ ::t i ::comment Syriac hbasa above
|
600 |
+
::s ܝܺ ::t i ::comment Syriac yudh + hbasa above
|
601 |
+
::s ܼ ::t u ::comment Syriac hbasa-esasa dotted
|
602 |
+
::s ܽ ::t o ::comment Syriac esasa above
|
603 |
+
::s ܾ ::t u ::comment Syriac esasa below
|
604 |
+
::s ݇ ::t "" ::comment Syriac oblique line above; indication of a silent letter
|
605 |
+
|
606 |
+
::s ܖ ::t d ::comment Syriac letter dotless dalath rish; ambiguous form for undifferentiated early dalath/rish
|
607 |
+
::s ܜ ::t t ::comment Syriac letter teth garshuni; used in Garshuni documents
|
608 |
+
::s ܒ݂ ::t v ::comment Syriac beth + rukkakha
|
609 |
+
::s ܒ̥ ::t v ::comment Syriac beth + ring-below
|
610 |
+
::s ܓ݂ ::t g ::comment Syriac gammal + rukkakha [IPA: ɣ]
|
611 |
+
::s ܓ̥ ::t g ::comment Syriac gammal + ring-below [IPA: ɣ]
|
612 |
+
::s ܕ݂ ::t d ::comment Syriac dalath + rukkakha [IPA: ð]
|
613 |
+
::s ܕ̥ ::t d ::comment Syriac dalath + ring-below [IPA: ð]
|
614 |
+
::s ܟ݂ ::t kh ::comment Syriac kaph + rukkakha [IPA: x]
|
615 |
+
::s ܟ̥ ::t kh ::comment Syriac kaph + ring-below [IPA: x]
|
616 |
+
::s ܦ݂ ::t f ::comment Syriac pe + rukkakha
|
617 |
+
::s ܦ̥ ::t f ::comment Syriac pe + ring-below
|
618 |
+
::s ܦ݁ ::t p ::comment Syriac pe + qushshaya
|
619 |
+
::s ܬ݂ ::t th ::comment Syriac taw + rukkakha [IPA: θ]
|
620 |
+
::s ܬ̥ ::t th ::comment Syriac taw + ring-below [IPA: θ]
|
621 |
+
|
622 |
+
::s ܄ ::t : ::comment Syriac sublinear colon; used at the end of verses of supplications
|
623 |
+
::s ܆ ::t , ::comment Syriac colon skewed left; marks a dependent clause
|
624 |
+
::s ܇ ::t , ::comment Syriac colon skewed right; marks the end of a subdivision of the apodosis, or latter part of a Biblical verse
|
625 |
+
|
626 |
+
# Uzbek
|
627 |
+
::s ʻ ::t ' ::comment modifies pronunciation of preceding "o" and "g"
|
628 |
+
::s ʼ ::t ' ::comment glottal stop (tutuq belgisi)
|
629 |
+
|
630 |
+
# Uyghur
|
631 |
+
::s ئا ::t a ::lcode uig
|
632 |
+
::s ە ::t e ::lcode uig
|
633 |
+
::s ئې ::t e ::lcode uig ::latinplus ë
|
634 |
+
::s ې ::t e ::lcode uig ::latinplus ë
|
635 |
+
::s ئە ::t e ::lcode uig
|
636 |
+
::s يە ::t e ::lcode uig
|
637 |
+
::s ئى ::t i ::lcode uig
|
638 |
+
::s ى ::t i ::lcode uig
|
639 |
+
::s ئو ::t o ::lcode uig
|
640 |
+
::s و ::t o ::lcode uig
|
641 |
+
::s ئۇ ::t u ::lcode uig
|
642 |
+
::s ۇ ::t u ::lcode uig
|
643 |
+
::s چ ::t ch ::t-alt q ::lcode uig
|
644 |
+
::s خ ::t x ::lcode uig
|
645 |
+
::s ژ ::t zh ::lcode uig
|
646 |
+
::s ئۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
|
647 |
+
::s ۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
|
648 |
+
::s ئۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
|
649 |
+
::s ۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
|
650 |
+
::s ۋ ::t w ::lcode uig
|
651 |
+
|
652 |
+
# Maldivian
|
653 |
+
::s ް ::t ::comment thaana sukun
|
654 |
+
::s ަ ::t a ::comment thaana abafili
|
655 |
+
::s ާ ::t aa ::comment thaana aabaafili
|
656 |
+
::s ި ::t i ::comment thaana ibifili
|
657 |
+
::s ީ ::t ee ::comment thaana eebeefili
|
658 |
+
::s ު ::t u ::comment thaana ubufili
|
659 |
+
::s ޫ ::t oo ::comment thaana ooboofili
|
660 |
+
::s ެ ::t e ::comment thaana ebefili
|
661 |
+
::s ޭ ::t ey ::comment thaana eybeyfili
|
662 |
+
::s ޮ ::t o ::comment thaana obofili
|
663 |
+
::s ޯ ::t oa ::comment thaana oaboafili
|
664 |
+
|
665 |
+
# Canadian syllabics (Inuktitut)
|
666 |
+
::s ᑊ ::t p ::comment syllable final
|
667 |
+
::s ᐟ ::t t ::comment syllable final
|
668 |
+
::s ᐠ ::t k ::comment syllable final
|
669 |
+
::s ᐨ ::t c ::comment syllable final
|
670 |
+
::s ᒼ ::t m ::comment syllable final
|
671 |
+
::s ᐣ ::t n ::comment syllable final
|
672 |
+
::s ᐢ ::t s ::comment syllable final
|
673 |
+
::s ᐧ ::t y ::comment syllable final
|
674 |
+
::s ᐤ ::t w ::comment syllable final
|
675 |
+
::s ᐦ ::t h ::comment syllable final
|
676 |
+
::s ᕽ ::t hk ::comment syllable final
|
677 |
+
::s ᓫ ::t l ::comment syllable final
|
678 |
+
::s ᕑ ::t r ::comment syllable final
|
679 |
+
|
680 |
+
## Punctuation
|
681 |
+
# delete
|
682 |
+
::s ¿ ::t "" ::comment inverted question mark
|
683 |
+
::s ¡ ::t "" ::comment inverted exclamation mark
|
684 |
+
# preserve
|
685 |
+
::s ′ ::t ′
|
686 |
+
# Cyrillic
|
687 |
+
::s ⁙ ::t . ::comment five dot punctuation
|
688 |
+
# Amharic/Ethiopian
|
689 |
+
::s ። ::t .
|
690 |
+
::s ፣ ::t ,
|
691 |
+
::s ፤ ::t ;
|
692 |
+
::s ፥ ::t :
|
693 |
+
::s ፡ ::t " " ::comment Ethiopic wordspace
|
694 |
+
::s ፦ ::t : ::comment Ethiopic preface colon
|
695 |
+
::s ቸ ::t cha ::comment Ethiopic syllable ca
|
696 |
+
::s ቹ ::t chu ::comment Ethiopic syllable cu
|
697 |
+
::s ቺ ::t chi ::comment Ethiopic syllable ci
|
698 |
+
::s ቻ ::t chaa ::comment Ethiopic syllable caa
|
699 |
+
::s ቼ ::t chee ::comment Ethiopic syllable cee
|
700 |
+
::s ች ::t che ::comment Ethiopic syllable ce
|
701 |
+
::s ቾ ::t cho ::comment Ethiopic syllable co
|
702 |
+
::s ሠ ::t sa ::comment Ethiopic syllable sza
|
703 |
+
::s ሡ ::t su ::comment Ethiopic syllable szu
|
704 |
+
::s ሢ ::t si ::comment Ethiopic syllable szi
|
705 |
+
::s ሣ ::t saa ::comment Ethiopic syllable szaa
|
706 |
+
::s ሤ ::t see::comment Ethiopic syllable szee
|
707 |
+
::s ሥ ::t se ::comment Ethiopic syllable sze
|
708 |
+
::s ሦ ::t so ::comment Ethiopic syllable szo
|
709 |
+
::s ጠ ::t ta ::comment Ethiopic syllable tha with ejective 't'
|
710 |
+
::s ጡ ::t tu ::comment Ethiopic syllable thu with ejective 't'
|
711 |
+
::s ጢ ::t ti ::comment Ethiopic syllable thi with ejective 't'
|
712 |
+
::s ጣ ::t taa ::comment Ethiopic syllable thaa with ejective 't'
|
713 |
+
::s ጤ ::t tee ::comment Ethiopic syllable thee with ejective 't'
|
714 |
+
::s ጥ ::t te ::comment Ethiopic syllable the with ejective 't'
|
715 |
+
::s ጦ ::t to ::comment Ethiopic syllable tho with ejective 't'
|
716 |
+
|
717 |
+
# Devanagari (Hindi etc.)
|
718 |
+
::s । ::t . ::comment danda
|
719 |
+
::s ॥ ::t . ::comment double danda
|
720 |
+
::s ৷ ::t . ::comment Bengali currency numerator four; used as danda
|
721 |
+
::s ॰ ::t . ::comment Devanagari abbreviation sign
|
722 |
+
# Oriya/Odia (India)
|
723 |
+
::s ::t . ::comment danda (deprecated, should use Devanagari danda ।)
|
724 |
+
::s ::t . ::comment double danda (deprecated, should use Devanagari double danda ॥)
|
725 |
+
# Tibetan
|
726 |
+
::s ། ::t ,
|
727 |
+
::s །: ::t :
|
728 |
+
::s ༏ ::t ;
|
729 |
+
::s ༎ ::t .
|
730 |
+
::s ༑ ::t , ::comment Tibetan mark run chen spungs shad
|
731 |
+
::s ༼ ::t ( ::comment Tibetan open roof punctuation
|
732 |
+
::s ༽ ::t ) ::comment Tibetan close roof punctuation
|
733 |
+
::s ༈ ::t "" ::comment Tibetan mark srbul shad
|
734 |
+
::s 【 ::t [ ::comment left black lenticular bracket
|
735 |
+
::s 】 ::t ] ::comment right black lenticular bracket
|
736 |
+
::s ༄ ::t "" ::comment Tibetan head mark
|
737 |
+
::s ༄༅ ::t "" ::comment Tibetan head mark
|
738 |
+
::s ༆ ::t "" ::comment Tibetan head mark
|
739 |
+
# Myanmar/Burmese
|
740 |
+
::s ၊ ::t ,
|
741 |
+
::s ။ ::t .
|
742 |
+
# Khmer
|
743 |
+
::s ៖ ::t ; ::comment Khmer sign camnuc pii kuuh
|
744 |
+
::s ។ ::t . ::comment Khmer sign khan
|
745 |
+
# Arabic
|
746 |
+
::s ، ::t ,
|
747 |
+
::s ؛ ::t ;
|
748 |
+
::s ٬ ::t ,
|
749 |
+
::s ۔ ::t .
|
750 |
+
::s ؟ ::t ?
|
751 |
+
::s ٪ ::t %
|
752 |
+
::s ٫ ::t , ::comment Arabic decimal separator
|
753 |
+
::s ۽ ::t & ::comment Arabic sign Sindhi ampersand
|
754 |
+
# Aramaic
|
755 |
+
::s ܀ ::t .
|
756 |
+
::s ܂ ::t .
|
757 |
+
# Hebrew
|
758 |
+
::s ־ ::t - ::comment maqaf
|
759 |
+
# Armenian
|
760 |
+
::s ։ ::t .
|
761 |
+
::s ՝ ::t , ::comment Armenian comma
|
762 |
+
# Chinese
|
763 |
+
::s , ::t ", "
|
764 |
+
::s 、 ::t ", "
|
765 |
+
::s 。 ::t ". "
|
766 |
+
::s ! ::t "! "
|
767 |
+
::s ? ::t "? "
|
768 |
+
::s 「 ::t ' "'
|
769 |
+
::s 」 ::t '" '
|
770 |
+
::s 《 ::t ' "'
|
771 |
+
::s 》 ::t '" '
|
772 |
+
::s ( ::t " ("
|
773 |
+
::s ) ::t ") "
|
774 |
+
::s ; ::t ;
|
775 |
+
::s : ::t ": "
|
776 |
+
::s ︰ ::t ": "
|
777 |
+
::s - ::t -
|
778 |
+
::s / ::t /
|
779 |
+
::s = ::t =
|
780 |
+
::s ~ ::t ~
|
781 |
+
::s & ::t &
|
782 |
+
::s < ::t <
|
783 |
+
::s > ::t >
|
784 |
+
::s % ::t %
|
785 |
+
::s ::t " " ::comment ideographic space
|
786 |
+
# Japanese
|
787 |
+
::s 『 ::t ' "'
|
788 |
+
::s 』 ::t '" '
|
789 |
+
::s ・ ::t " " ::comment Katakana middle dot; separates name elements such as first and last name
|
790 |
+
|
791 |
+
# Symbols
|
792 |
+
::s ∞ ::t ∞ ::comment infinity
|
793 |
+
::s ::t ::comment soft hyphen; used to indicate preferred line breaks; remove
|
794 |
+
::s ֊ ::t - ::comment Armenian hyphen; map to regular hyphen-minus
|
795 |
+
::s ᐩ ::t + ::comment Canadian syllabics final plus; map to regular plus
|
796 |
+
::s ﹐ ::t , ::comment small comma; map to regular comma
|
797 |
+
::s ˚ ::t ° ::comment ring above; map to degree sign
|
798 |
+
::s ⇒ ::t ⇒ ::comment rightwards double arrow
|
799 |
+
::s † ::t † ::comment dagger
|
800 |
+
::s • ::t • ::comment bullet
|
801 |
+
::s ℃ ::t °C ::comment degree Celsius; split into 2 characters
|
802 |
+
::s ℉ ::t °F ::comment degree Fahrenheit; split into 2 characters
|
803 |
+
::s ― ::t ― ::comment horizontal bar
|
804 |
+
::s ˇ ::t ˇ ::comment caron (sometimes apparently used for "Arabic vowel sign small v above" U+065A, e.g. in Gilaki language (glk))
|
805 |
+
::s ″ ::t ″ ::comment double prime
|
806 |
+
::s ﴾ ::t ( ::comment ornate left parenthesis
|
807 |
+
::s ﴿ ::t ) ::comment ornate right parenthesis
|
808 |
+
::s 〔 ::t [ ::comment left tortoise shell bracket
|
809 |
+
::s 〕 ::t ] ::comment right tortoise shell bracket
|
810 |
+
::s ﹝ ::t ( ::comment small left tortoise shell bracket
|
811 |
+
::s ﹞ ::t ) ::comment small right tortoise shell bracket
|
812 |
+
::s ♄ ::t ♄ ::comment Saturn
|
813 |
+
::s ♆ ::t ♆ ::comment Neptune
|
814 |
+
::s ♋ ::t ♋ ::comment Cancer
|
uroman/data/string-distance-cost-rules.txt
ADDED
@@ -0,0 +1,896 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# String distance
|
2 |
+
|
3 |
+
::s1 a ::s2 ::cost 0.1
|
4 |
+
::s1 b ::s2 ::cost 1
|
5 |
+
::s1 b ::s2 ::cost 0.2 ::left1 /[aou]m$/ ::right1 [e] ::lc1 eng ::lc2 zho ::example Balcombe
|
6 |
+
::s1 c ::s2 ::cost 1
|
7 |
+
::s1 c ::s2 ::cost 0.2 ::left1 /[aeou]$/ ::right1 [cgkq] ::lc2 zho
|
8 |
+
::s1 c ::s2 ::cost 0.5 ::left1 /[aeou][lnr]?$/ ::right1 [h] ::lc2 zho
|
9 |
+
::s1 d ::s2 ::cost 1
|
10 |
+
::s1 d ::s2 ::cost 0.5 ::left1 /[aeiou][lnr]$/ ::right1 [-,$ ]
|
11 |
+
::s1 d ::s2 ::cost 0.4 ::lc1 eng ::lc2 zho ::right1 [bcfgklmnpqrstvwxz]
|
12 |
+
::s1 e ::s2 ::cost 0.1
|
13 |
+
::s1 é ::s2 ::cost 0.1
|
14 |
+
::s1 e ::s2 ::cost 0.02 ::lc2 fas
|
15 |
+
::s1 e ::s2 ::cost 0.02 ::lc1 amh ::lc2 eng
|
16 |
+
::s1 f ::s2 ::cost 1
|
17 |
+
::s1 g ::s2 ::cost 1
|
18 |
+
::s1 g ::s2 ::cost 0.4 ::right1 [bcdfghklmnpqrstvwxz] ::lc2 zho
|
19 |
+
::s1 g ::s2 ::cost 0.2 ::right1 [k] ::lc2 zho
|
20 |
+
::s1 h ::s2 ::cost 0.5
|
21 |
+
::s1 h ::s2 ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
22 |
+
::s1 h ::s2 ::cost 0.2 ::left1 /[bdlnr]$/ ::right1 [-,$ aeiouy] ::example Delhi, Minh, Riyadh
|
23 |
+
::s1 i ::s2 ::cost 0.1
|
24 |
+
::s1 j ::s2 ::cost 0.5
|
25 |
+
::s1 k ::s2 ::cost 1
|
26 |
+
::s1 l ::s2 ::cost 1
|
27 |
+
::s1 l ::s2 ::cost 0.3 ::left1 /eui$/ ::right1 [-,$ ] ::example Argenteuil
|
28 |
+
::s1 l ::s2 ::cost 0.3 ::left1 /a$/ ::right1 [km] ::comment walk, palm
|
29 |
+
::s1 l ::s2 ::cost 0.3 ::left1 /[aeiou]$/ ::right1 [bdfgkmpstvwz] ::lc2 zho
|
30 |
+
::s1 m ::s2 ::cost 1
|
31 |
+
::s1 n ::s2 ::cost 1
|
32 |
+
::s1 n ::s2 ::cost 0.7 ::right1 [-,$ ]
|
33 |
+
::s1 o ::s2 ::cost 0.1
|
34 |
+
::s1 p ::s2 ::cost 1
|
35 |
+
::s1 q ::s2 ::cost 1
|
36 |
+
::s1 r ::s2 ::cost 1
|
37 |
+
::s1 r ::s2 ::cost 0.5 ::left1 /[aou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ]
|
38 |
+
::s1 r ::s2 ::cost 0.3 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
|
39 |
+
::s1 re ::s2 ::cost 0.4 ::left1 /[ou]$/ ::right1 [-,$ ] ::lc2 zho
|
40 |
+
::s1 re ::s2 ::cost 0.5 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
|
41 |
+
::s1 rr ::s2 ::cost 0.5 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
|
42 |
+
::s1 s ::s2 ::cost 1
|
43 |
+
::s1 s ::s2 ::cost 0.6 ::right1 [-,$ ]
|
44 |
+
::s1 t ::s2 ::cost 1
|
45 |
+
::s1 t ::s2 ::cost 0.5 ::left1 /[aeiou][lnr]?$/ ::right1 [-,$ ]
|
46 |
+
::s1 t ::s2 ::cost 0.6 ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz]
|
47 |
+
::s1 u ::s2 ::cost 0.1
|
48 |
+
::s1 v ::s2 ::cost 1
|
49 |
+
::s1 w ::s2 ::cost 1
|
50 |
+
::s1 w ::s2 ::cost 0.4 ::lc1 eng ::right1 [i][c][hk][-,$ ] ::example Greenwich, Alnwick
|
51 |
+
::s1 x ::s2 ::cost 1
|
52 |
+
::s1 y ::s2 ::cost 0.3
|
53 |
+
::s1 z ::s2 ::cost 1
|
54 |
+
::s1 ı ::s2 ::cost 0.3
|
55 |
+
::s1 0 ::s2 ::cost 1
|
56 |
+
::s1 1 ::s2 ::cost 1
|
57 |
+
::s1 2 ::s2 ::cost 1
|
58 |
+
::s1 3 ::s2 ::cost 1
|
59 |
+
::s1 4 ::s2 ::cost 1
|
60 |
+
::s1 5 ::s2 ::cost 1
|
61 |
+
::s1 6 ::s2 ::cost 1
|
62 |
+
::s1 7 ::s2 ::cost 1
|
63 |
+
::s1 8 ::s2 ::cost 1
|
64 |
+
::s1 9 ::s2 ::cost 1
|
65 |
+
::s1 ' ::s2 ::cost 0.1
|
66 |
+
::s1 ` ::s2 ::cost 0.1
|
67 |
+
::s1 ( ::s2 ::cost 0.1
|
68 |
+
::s1 ) ::s2 ::cost 0.1
|
69 |
+
::s1 , ::s2 ::cost 0.1
|
70 |
+
::s1 ; ::s2 ::cost 0.1
|
71 |
+
::s1 - ::s2 ::cost 0.1
|
72 |
+
::s1 . ::s2 ::cost 0.1
|
73 |
+
::s1 .. ::s2 ::cost 0.12
|
74 |
+
::s1 ... ::s2 ::cost 0.14
|
75 |
+
::s1 ? ::s2 ::cost 0.2
|
76 |
+
::s1 ! ::s2 ::cost 0.2
|
77 |
+
::s1 ‼ ::s2 ::cost 0.2
|
78 |
+
::s1 ‼ ::s2 !! ::cost 0.02
|
79 |
+
::s1 ‼ ::s2 ! ::cost 0.1
|
80 |
+
::s1 / ::s2 ::cost 0.1
|
81 |
+
::s1 : ::s2 ::cost 0.1
|
82 |
+
::s1 ː ::s2 ::cost 0.1
|
83 |
+
::s1 ː ::s2 : ::cost 0.1
|
84 |
+
::s1 « ::s2 ::cost 0.1
|
85 |
+
::s1 » ::s2 ::cost 0.1
|
86 |
+
::s1 – ::s2 ::cost 0.1
|
87 |
+
::s1 – ::s2 - ::cost 0.05
|
88 |
+
::s1 — ::s2 ::cost 0.15
|
89 |
+
::s1 — ::s2 - ::cost 0.1
|
90 |
+
::s1 — ::s2 – ::cost 0.05
|
91 |
+
::s1 ─ ::s2 ::cost 0.2
|
92 |
+
::s1 ─ ::s2 - ::cost 0.15
|
93 |
+
::s1 ─ ::s2 – ::cost 0.1
|
94 |
+
::s1 ─ ::s2 — ::cost 0.05
|
95 |
+
::s1 ’ ::s2 ::cost 0.1
|
96 |
+
::s1 ʼ ::s2 ::cost 0.1
|
97 |
+
::s1 " " ::s2 ::cost 0.1
|
98 |
+
::s1 “ ::s2 ::cost 0.1
|
99 |
+
::s1 ” ::s2 ::cost 0.1
|
100 |
+
::s1 ″ ::s2 ::cost 0.1
|
101 |
+
::s1 # ::s2 ::cost 0.3
|
102 |
+
::s1 + ::s2 ::cost 0.3
|
103 |
+
::s1 * ::s2 ::cost 0.3
|
104 |
+
::s1 = ::s2 ::cost 0.3
|
105 |
+
::s1 < ::s2 ::cost 0.3
|
106 |
+
::s1 > ::s2 ::cost 0.3
|
107 |
+
::s1 [ ::s2 ::cost 0.3
|
108 |
+
::s1 ] ::s2 ::cost 0.3
|
109 |
+
::s1 { ::s2 ::cost 0.3
|
110 |
+
::s1 } ::s2 ::cost 0.3
|
111 |
+
::s1 | ::s2 ::cost 0.3
|
112 |
+
::s1 & ::s2 ::cost 0.3
|
113 |
+
::s1 _ ::s2 ::cost 0.3
|
114 |
+
::s1 • ::s2 ::cost 0.1
|
115 |
+
::s1 · ::s2 ::cost 0.1
|
116 |
+
::s1 ◦ ::s2 ::cost 0.1
|
117 |
+
::s1 ° ::s2 ::cost 0.1
|
118 |
+
::s1 … ::s2 ::cost 0.1
|
119 |
+
::s1 … ::s2 ... ::cost 0
|
120 |
+
::s1 @ ::s2 ::cost 0.3
|
121 |
+
::s1 © ::s2 ::cost 0.3
|
122 |
+
::s1 © ::s2 (c) ::cost 0.1
|
123 |
+
|
124 |
+
|
125 |
+
::s1 a ::s2 aa ::cost 0.02
|
126 |
+
::s1 a ::s2 aaa ::cost 0.03
|
127 |
+
::s1 a ::s2 aaaa ::cost 0.03
|
128 |
+
::s1 a ::s2 aaaaa ::cost 0.03
|
129 |
+
::s1 a ::s2 aaaaaa ::cost 0.04
|
130 |
+
::s1 a ::s2 aaaaaaa ::cost 0.04
|
131 |
+
::s1 a ::s2 aaaaaaaa ::cost 0.04
|
132 |
+
::s1 a ::s2 aaaaaaaaa ::cost 0.04
|
133 |
+
::s1 a ::s2 aaaaaaaaaa ::cost 0.04
|
134 |
+
::s1 a ::s2 aaaaaaaaaaa ::cost 0.04
|
135 |
+
::s1 a ::s2 aaaaaaaaaaaa ::cost 0.04
|
136 |
+
::s1 a ::s2 aaaaaaaaaaaaa ::cost 0.04
|
137 |
+
::s1 a ::s2 aaaaaaaaaaaaaa ::cost 0.04
|
138 |
+
::s1 a ::s2 aaaaaaaaaaaaaaa ::cost 0.04
|
139 |
+
::s1 a ::s2 aaaaaaaaaaaaaaaa ::cost 0.04
|
140 |
+
::s1 b ::s2 bb ::cost 0.02
|
141 |
+
::s1 b ::s2 bbb ::cost 0.03
|
142 |
+
::s1 b ::s2 bbbb ::cost 0.03
|
143 |
+
::s1 b ::s2 bbbbb ::cost 0.03
|
144 |
+
::s1 c ::s2 cc ::cost 0.02
|
145 |
+
::s1 c ::s2 ccc ::cost 0.03
|
146 |
+
::s1 c ::s2 cccc ::cost 0.03
|
147 |
+
::s1 c ::s2 ccccc ::cost 0.03
|
148 |
+
::s1 d ::s2 dd ::cost 0.02
|
149 |
+
::s1 d ::s2 ddd ::cost 0.03
|
150 |
+
::s1 d ::s2 dddd ::cost 0.03
|
151 |
+
::s1 d ::s2 ddddd ::cost 0.03
|
152 |
+
::s1 e ::s2 ee ::cost 0.02
|
153 |
+
::s1 e ::s2 eee ::cost 0.03
|
154 |
+
::s1 e ::s2 eeee ::cost 0.03
|
155 |
+
::s1 e ::s2 eeeee ::cost 0.03
|
156 |
+
::s1 e ::s2 eeeeee ::cost 0.04
|
157 |
+
::s1 e ::s2 eeeeeee ::cost 0.04
|
158 |
+
::s1 e ::s2 eeeeeeee ::cost 0.04
|
159 |
+
::s1 e ::s2 eeeeeeeee ::cost 0.04
|
160 |
+
::s1 e ::s2 eeeeeeeeee ::cost 0.04
|
161 |
+
::s1 e ::s2 eeeeeeeeeee ::cost 0.04
|
162 |
+
::s1 e ::s2 eeeeeeeeeeee ::cost 0.04
|
163 |
+
::s1 e ::s2 eeeeeeeeeeeee ::cost 0.04
|
164 |
+
::s1 e ::s2 eeeeeeeeeeeeee ::cost 0.04
|
165 |
+
::s1 e ::s2 eeeeeeeeeeeeeee ::cost 0.04
|
166 |
+
::s1 e ::s2 eeeeeeeeeeeeeeee ::cost 0.04
|
167 |
+
::s1 f ::s2 ff ::cost 0.02
|
168 |
+
::s1 f ::s2 fff ::cost 0.03
|
169 |
+
::s1 f ::s2 ffff ::cost 0.03
|
170 |
+
::s1 f ::s2 fffff ::cost 0.03
|
171 |
+
::s1 g ::s2 gg ::cost 0.02
|
172 |
+
::s1 g ::s2 ggg ::cost 0.03
|
173 |
+
::s1 g ::s2 gggg ::cost 0.03
|
174 |
+
::s1 g ::s2 ggggg ::cost 0.03
|
175 |
+
::s1 h ::s2 hh ::cost 0.02
|
176 |
+
::s1 h ::s2 hhh ::cost 0.03
|
177 |
+
::s1 h ::s2 hhhh ::cost 0.03
|
178 |
+
::s1 h ::s2 hhhhh ::cost 0.03
|
179 |
+
::s1 i ::s2 ii ::cost 0.02
|
180 |
+
::s1 i ::s2 iii ::cost 0.03
|
181 |
+
::s1 i ::s2 iiii ::cost 0.03
|
182 |
+
::s1 i ::s2 iiiii ::cost 0.03
|
183 |
+
::s1 i ::s2 iiiiii ::cost 0.04
|
184 |
+
::s1 i ::s2 iiiiiii ::cost 0.04
|
185 |
+
::s1 i ::s2 iiiiiiii ::cost 0.04
|
186 |
+
::s1 i ::s2 iiiiiiiii ::cost 0.04
|
187 |
+
::s1 i ::s2 iiiiiiiiii ::cost 0.04
|
188 |
+
::s1 i ::s2 iiiiiiiiiii ::cost 0.04
|
189 |
+
::s1 i ::s2 iiiiiiiiiiii ::cost 0.04
|
190 |
+
::s1 i ::s2 iiiiiiiiiiiii ::cost 0.04
|
191 |
+
::s1 i ::s2 iiiiiiiiiiiiii ::cost 0.04
|
192 |
+
::s1 i ::s2 iiiiiiiiiiiiiii ::cost 0.04
|
193 |
+
::s1 i ::s2 iiiiiiiiiiiiiiii ::cost 0.04
|
194 |
+
::s1 j ::s2 jj ::cost 0.02
|
195 |
+
::s1 j ::s2 jjj ::cost 0.03
|
196 |
+
::s1 j ::s2 jjjj ::cost 0.03
|
197 |
+
::s1 j ::s2 jjjjj ::cost 0.03
|
198 |
+
::s1 k ::s2 kk ::cost 0.02
|
199 |
+
::s1 k ::s2 kkk ::cost 0.03
|
200 |
+
::s1 k ::s2 kkkk ::cost 0.03
|
201 |
+
::s1 k ::s2 kkkkk ::cost 0.03
|
202 |
+
::s1 l ::s2 ll ::cost 0.02
|
203 |
+
::s1 l ::s2 lll ::cost 0.03
|
204 |
+
::s1 l ::s2 llll ::cost 0.03
|
205 |
+
::s1 l ::s2 lllll ::cost 0.03
|
206 |
+
::s1 m ::s2 mm ::cost 0.02
|
207 |
+
::s1 m ::s2 mmm ::cost 0.03
|
208 |
+
::s1 m ::s2 mmmm ::cost 0.03
|
209 |
+
::s1 m ::s2 mmmmm ::cost 0.03
|
210 |
+
::s1 n ::s2 nn ::cost 0.02
|
211 |
+
::s1 n ::s2 nnn ::cost 0.03
|
212 |
+
::s1 n ::s2 nnnn ::cost 0.03
|
213 |
+
::s1 n ::s2 nnnnn ::cost 0.03
|
214 |
+
::s1 o ::s2 oo ::cost 0.02
|
215 |
+
::s1 o ::s2 ooo ::cost 0.03
|
216 |
+
::s1 o ::s2 oooo ::cost 0.03
|
217 |
+
::s1 o ::s2 ooooo ::cost 0.03
|
218 |
+
::s1 o ::s2 oooooo ::cost 0.04
|
219 |
+
::s1 o ::s2 ooooooo ::cost 0.04
|
220 |
+
::s1 o ::s2 oooooooo ::cost 0.04
|
221 |
+
::s1 o ::s2 ooooooooo ::cost 0.04
|
222 |
+
::s1 o ::s2 oooooooooo ::cost 0.04
|
223 |
+
::s1 o ::s2 ooooooooooo ::cost 0.04
|
224 |
+
::s1 o ::s2 oooooooooooo ::cost 0.04
|
225 |
+
::s1 o ::s2 ooooooooooooo ::cost 0.04
|
226 |
+
::s1 o ::s2 oooooooooooooo ::cost 0.04
|
227 |
+
::s1 o ::s2 ooooooooooooooo ::cost 0.04
|
228 |
+
::s1 o ::s2 oooooooooooooooo ::cost 0.04
|
229 |
+
::s1 p ::s2 pp ::cost 0.02
|
230 |
+
::s1 p ::s2 ppp ::cost 0.03
|
231 |
+
::s1 p ::s2 pppp ::cost 0.03
|
232 |
+
::s1 p ::s2 ppppp ::cost 0.03
|
233 |
+
::s1 q ::s2 qq ::cost 0.02
|
234 |
+
::s1 q ::s2 qqq ::cost 0.03
|
235 |
+
::s1 q ::s2 qqqq ::cost 0.03
|
236 |
+
::s1 q ::s2 qqqqq ::cost 0.03
|
237 |
+
::s1 r ::s2 rr ::cost 0.02
|
238 |
+
::s1 r ::s2 rrr ::cost 0.03
|
239 |
+
::s1 r ::s2 rrrr ::cost 0.03
|
240 |
+
::s1 r ::s2 rrrrr ::cost 0.03
|
241 |
+
::s1 s ::s2 ss ::cost 0.02
|
242 |
+
::s1 s ::s2 sss ::cost 0.03
|
243 |
+
::s1 s ::s2 ssss ::cost 0.03
|
244 |
+
::s1 s ::s2 sssss ::cost 0.03
|
245 |
+
::s1 t ::s2 tt ::cost 0.02
|
246 |
+
::s1 t ::s2 ttt ::cost 0.03
|
247 |
+
::s1 t ::s2 tttt ::cost 0.03
|
248 |
+
::s1 t ::s2 ttttt ::cost 0.03
|
249 |
+
::s1 u ::s2 uu ::cost 0.02
|
250 |
+
::s1 u ::s2 uuu ::cost 0.03
|
251 |
+
::s1 u ::s2 uuuu ::cost 0.03
|
252 |
+
::s1 u ::s2 uuuuu ::cost 0.03
|
253 |
+
::s1 u ::s2 uuuuuu ::cost 0.04
|
254 |
+
::s1 u ::s2 uuuuuuu ::cost 0.04
|
255 |
+
::s1 u ::s2 uuuuuuuu ::cost 0.04
|
256 |
+
::s1 u ::s2 uuuuuuuuu ::cost 0.04
|
257 |
+
::s1 u ::s2 uuuuuuuuuu ::cost 0.04
|
258 |
+
::s1 u ::s2 uuuuuuuuuuu ::cost 0.04
|
259 |
+
::s1 u ::s2 uuuuuuuuuuuu ::cost 0.04
|
260 |
+
::s1 u ::s2 uuuuuuuuuuuuu ::cost 0.04
|
261 |
+
::s1 u ::s2 uuuuuuuuuuuuuu ::cost 0.04
|
262 |
+
::s1 u ::s2 uuuuuuuuuuuuuuu ::cost 0.04
|
263 |
+
::s1 u ::s2 uuuuuuuuuuuuuuuu ::cost 0.04
|
264 |
+
::s1 v ::s2 vv ::cost 0.02
|
265 |
+
::s1 v ::s2 vvv ::cost 0.03
|
266 |
+
::s1 v ::s2 vvvv ::cost 0.03
|
267 |
+
::s1 v ::s2 vvvvv ::cost 0.03
|
268 |
+
::s1 w ::s2 ww ::cost 0.02
|
269 |
+
::s1 w ::s2 www ::cost 0.03
|
270 |
+
::s1 w ::s2 wwww ::cost 0.03
|
271 |
+
::s1 w ::s2 wwwww ::cost 0.03
|
272 |
+
::s1 x ::s2 xx ::cost 0.02
|
273 |
+
::s1 x ::s2 xxx ::cost 0.03
|
274 |
+
::s1 x ::s2 xxxx ::cost 0.03
|
275 |
+
::s1 x ::s2 xxxxx ::cost 0.03
|
276 |
+
::s1 y ::s2 yy ::cost 0.02
|
277 |
+
::s1 y ::s2 yyy ::cost 0.03
|
278 |
+
::s1 y ::s2 yyyy ::cost 0.03
|
279 |
+
::s1 y ::s2 yyyyy ::cost 0.03
|
280 |
+
::s1 z ::s2 zz ::cost 0.02
|
281 |
+
::s1 z ::s2 zzz ::cost 0.03
|
282 |
+
::s1 z ::s2 zzzz ::cost 0.03
|
283 |
+
::s1 z ::s2 zzzzz ::cost 0.03
|
284 |
+
::s1 " " ::s2 " " ::cost 0
|
285 |
+
::s1 . ::s2 ::left1 /\./ ::left2 /\./ ::cost 0.02
|
286 |
+
::s1 … ::s2 ::left1 /…/ ::left2 /…/ ::cost 0.01
|
287 |
+
::s1 _ ::s2 ::left1 /_/ ::left2 /_/ ::cost 0.01
|
288 |
+
::s1 = ::s2 ::left1 /=/ ::left2 /=/ ::cost 0.01
|
289 |
+
::s1 ! ::s2 ::left1 /!/ ::left2 /!/ ::cost 0.02
|
290 |
+
::s1 ? ::s2 ::left1 /\?/ ::left2 /\?/ ::cost 0.02
|
291 |
+
::s1 aa ::s2 aː ::cost 0.02
|
292 |
+
::s1 ee ::s2 eː ::cost 0.02
|
293 |
+
::s1 ii ::s2 iː ::cost 0.02
|
294 |
+
::s1 oo ::s2 oː ::cost 0.02
|
295 |
+
::s1 uu ::s2 uː ::cost 0.02
|
296 |
+
|
297 |
+
::s1 a ::s2 e ::cost 0.1
|
298 |
+
::s1 au ::s2 o ::cost 0.1 ::lc1 eng
|
299 |
+
::s1 aw ::s2 o ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
300 |
+
::s1 aw ::s2 o ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
|
301 |
+
::s1 aw ::s2 a ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
|
302 |
+
::s1 ay ::s2 i ::cost 0.02 ::lc1 fas ::lc2 eng
|
303 |
+
::s1 aye ::s2 ae ::cost 0.05 ::lc1 fas
|
304 |
+
::s1 é ::s2 e ::cost 0.05
|
305 |
+
::s1 e ::s2 i ::cost 0.15
|
306 |
+
::s1 e ::s2 i ::cost 0.1 ::lc1 uig ::lc2 uig
|
307 |
+
::s1 e ::s2 y ::cost 0.15
|
308 |
+
::s1 ew ::s2 u ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
309 |
+
::s1 ew ::s2 u ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
|
310 |
+
::s1 ew ::s2 u ::cost 0.3 ::right1 [aei][lgnrst] ::lc1 eng
|
311 |
+
::s1 ew ::s2 e ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
|
312 |
+
::s1 i ::s2 a ::cost 0.1 ::right1 [-,$ ] ::lc1 fas
|
313 |
+
::s1 i ::s2 ea ::cost 0.03 ::lc2 eng
|
314 |
+
::s1 i ::s2 ee ::cost 0.03 ::lc2 eng
|
315 |
+
::s1 i ::s2 ei ::cost 0.05 ::lc2 eng
|
316 |
+
::s1 i ::s2 ie ::cost 0.03 ::lc2 eng
|
317 |
+
::s1 i ::s2 ı ::cost 0.05
|
318 |
+
::s1 i ::s2 e ::cost 0.1 ::lc2 eng
|
319 |
+
::s1 i ::s2 y ::cost 0.15
|
320 |
+
::s1 i ::s2 y ::cost 0.1 ::right2 [-,bcdfghklmnpqrstvwxz$ ]
|
321 |
+
::s1 ie ::s2 ei ::cost 0.15
|
322 |
+
::s1 ie ::s2 y ::cost 0.15
|
323 |
+
::s1 ij ::s2 ai ::cost 0.15
|
324 |
+
::s1 o ::s2 u ::cost 0.1
|
325 |
+
::s1 oo ::s2 u ::cost 0.1
|
326 |
+
::s1 ow ::s2 au ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
327 |
+
::s1 ow ::s2 o ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
328 |
+
::s1 ow ::s2 o ::cost 0.2 ::lc1 eng ::lc2 zho ::right1 [e]
|
329 |
+
::s1 ow ::s2 o ::cost 0.4 ::lc1 eng ::lc2 zho ::right1 [iy]
|
330 |
+
::s1 u ::s2 a ::cost 0.1 ::lc1 eng ::right1 [-,bcdfghklmnpqrstvwxz][bcdfghklmnpqrstvwxz$ ]
|
331 |
+
::s1 u ::s2 ou ::cost 0.05
|
332 |
+
::s1 u ::s2 yu ::cost 0.05 ::left1 /^(.*[- ])?$/
|
333 |
+
::s1 yeo ::s2 eo ::cost 0.1 ::lc1 fas
|
334 |
+
|
335 |
+
# Amharic
|
336 |
+
::s1 a ::s2 e ::cost 0.05 ::lc1 amh
|
337 |
+
::s1 aa ::s2 o ::cost 0.15 ::lc1 amh
|
338 |
+
::s1 aawe ::s2 au ::cost 0.05 ::lc1 amh
|
339 |
+
::s1 aawe ::s2 ao ::cost 0.1 ::lc1 amh
|
340 |
+
::s1 aawe ::s2 ou ::cost 0.1 ::lc1 amh
|
341 |
+
::s1 aawo ::s2 ao ::cost 0.05 ::lc1 amh
|
342 |
+
::s1 aaye ::s2 ai ::cost 0.05 ::lc1 amh
|
343 |
+
::s1 aaye ::s2 i ::cost 0.1 ::lc1 amh
|
344 |
+
::s1 aaye ::s2 ei ::cost 0.1 ::lc1 amh
|
345 |
+
::s1 awe ::s2 au ::cost 0.05 ::lc1 amh
|
346 |
+
::s1 awe ::s2 ao ::cost 0.1 ::lc1 amh
|
347 |
+
::s1 awe ::s2 ou ::cost 0.1 ::lc1 amh
|
348 |
+
::s1 ee ::s2 ai ::cost 0.1 ::lc1 amh
|
349 |
+
::s1 eewo ::s2 eo ::cost 0.05 ::lc1 amh
|
350 |
+
::s1 eeyaa ::s2 ea ::cost 0.1 ::lc1 amh
|
351 |
+
::s1 eeye ::s2 ai ::cost 0.1 ::lc1 amh
|
352 |
+
::s1 ewee ::s2 ue ::cost 0.1 ::lc1 amh
|
353 |
+
::s1 gwaa ::s2 gua ::cost 0.05 ::lc1 amh
|
354 |
+
::s1 iya ::s2 ie ::cost 0.05 ::lc1 amh
|
355 |
+
::s1 iyaa ::s2 ia ::cost 0.05 ::lc1 amh
|
356 |
+
::s1 iyo ::s2 io ::cost 0.05 ::lc1 amh
|
357 |
+
::s1 kxaa ::s2 kha ::cost 0.05 ::lc1 amh
|
358 |
+
::s1 liyaa ::s2 llia ::cost 0.05 ::lc1 amh
|
359 |
+
::s1 qaa ::s2 cca ::cost 0.05 ::lc1 amh
|
360 |
+
::s1 uwaa ::s2 ua ::cost 0.05 ::lc1 amh
|
361 |
+
::s1 uwee ::s2 ue ::cost 0.05 ::lc1 amh
|
362 |
+
::s1 uwi ::s2 oui ::cost 0.05 ::lc1 amh
|
363 |
+
::s1 uwi ::s2 ui ::cost 0.05 ::lc1 amh
|
364 |
+
::s1 xaaye ::s2 hai ::cost 0.1 ::lc1 amh
|
365 |
+
::s1 xwaa ::s2 jua ::cost 0.1 ::lc1 amh
|
366 |
+
::s1 ziyaa ::s2 sia ::cost 0.05 ::lc1 amh
|
367 |
+
::s1 w ::s2 ::cost 0.3 ::lc1 amh ::left1 /[aeiou]$/ ::right1 [aeiou]
|
368 |
+
::s1 y ::s2 ::cost 0.1 ::lc1 amh ::left1 /[aeiou]$/ ::right1 [aeiou]
|
369 |
+
# abbreviations
|
370 |
+
::s1 ee. ::s2 a ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
|
371 |
+
::s1 si. ::s2 c ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
|
372 |
+
::s1 di. ::s2 d ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
|
373 |
+
::s1 eefe. ::s2 f ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
|
374 |
+
::s1 are. ::s2 r ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
|
375 |
+
|
376 |
+
# Arabic
|
377 |
+
::s1 ::s2 a ::cost 0.02 ::lc1 ara
|
378 |
+
::s1 ::s2 e ::cost 0.02 ::lc1 ara
|
379 |
+
::s1 ::s2 i ::cost 0.05 ::lc1 ara
|
380 |
+
::s1 ::s2 o ::cost 0.05 ::lc1 ara
|
381 |
+
::s1 ::s2 p ::cost 0.15 ::lc1 ara ::left2 /m$/ ::right2 [dfgklmnpqrstvwz]
|
382 |
+
::s1 ::s2 u ::cost 0.05 ::lc1 ara
|
383 |
+
::s1 y ::s2 a ::cost 0.15 ::lc1 ara
|
384 |
+
::s1 y ::s2 e ::cost 0.05 ::lc1 ara
|
385 |
+
::s1 y ::s2 ea ::cost 0.02 ::lc1 ara
|
386 |
+
::s1 y ::s2 ee ::cost 0.02 ::lc1 ara
|
387 |
+
::s1 y ::s2 i ::cost 0.02 ::lc1 ara
|
388 |
+
::s1 y ::s2 ie ::cost 0.02 ::lc1 ara
|
389 |
+
::s1 b ::s2 p ::cost 0.02 ::lc1 ara
|
390 |
+
::s1 b ::s2 pp ::cost 0.03 ::lc1 ara
|
391 |
+
::s1 f ::s2 v ::cost 0.02 ::lc1 ara
|
392 |
+
::s1 fyl ::s2 ville ::right2 [-,$ ] ::cost 0.05 ::lc1 ara
|
393 |
+
::s1 gh ::s2 g ::right2 [abcdfgklmnopqrstuvwz] ::cost 0.05 ::lc1 ara
|
394 |
+
::s1 ghz ::s2 gs ::cost 0.05 ::lc1 ara
|
395 |
+
::s1 j ::s2 g ::cost 0.2 ::lc1 ara
|
396 |
+
::s1 kh ::s2 g ::cost 0.3 ::lc1 ara ::right2 [eiy]
|
397 |
+
::s1 q ::s2 g ::cost 0.2 ::lc1 ara ::right2 [arouz]
|
398 |
+
::s1 q ::s2 gg ::cost 0.2 ::lc1 ara ::right2 [arouz]
|
399 |
+
::s1 th ::s2 z ::cost 0.4 ::lc1 ara ::right2 [aou] ::comment Spanish
|
400 |
+
::s1 " (" ::s2 ", " ::cost 0.02 ::lc1 ara
|
401 |
+
::s1 ) ::s2 ::right2 [-,$ ] ::cost 0.02 ::lc1 ara
|
402 |
+
|
403 |
+
# Bengali
|
404 |
+
::s1 aoyaa ::s2 wa ::cost 0.1 ::lc1 ben
|
405 |
+
::s1 aoye ::s2 way ::cost 0.1 ::lc1 ben
|
406 |
+
::s1 bhaa ::s2 ve ::cost 0.1 ::lc1 ben
|
407 |
+
::s1 bh ::s2 v ::cost 0.2 ::lc1 ben
|
408 |
+
::s1 bh ::s2 w ::cost 0.2 ::lc1 ben
|
409 |
+
::s1 b ::s2 v ::cost 0.3 ::lc1 ben
|
410 |
+
::s1 b ::s2 w ::cost 0.3 ::lc1 ben
|
411 |
+
::s1 dda ::s2 rh ::right2 [-,$ ] ::cost 0.2 ::lc1 ben
|
412 |
+
::s1 dd ::s2 r ::cost 0.4 ::lc1 ben
|
413 |
+
::s1 gk ::s2 k ::cost 0.05 ::lc1 ben
|
414 |
+
::s1 h ::s2 g ::right2 [eiy] ::cost 0.4 ::lc1 ben
|
415 |
+
::s1 h ::s2 j ::cost 0.4 ::lc1 ben
|
416 |
+
::s1 hoyaai ::s2 whi ::cost 0.05 ::lc1 ben
|
417 |
+
::s1 j ::s2 z ::cost 0.1 ::lc1 ben
|
418 |
+
::s1 j ::s2 s ::cost 0.3 ::lc1 ben
|
419 |
+
::s1 myaaka ::s2 mc ::cost 0.1 ::lc1 ben
|
420 |
+
::s1 myaaka ::s2 mac ::cost 0.1 ::lc1 ben
|
421 |
+
::s1 oyaa ::s2 wa ::cost 0.02 ::lc1 ben
|
422 |
+
::s1 oyaa ::s2 wo ::cost 0.1 ::lc1 ben
|
423 |
+
::s1 oyena ::s2 owen ::cost 0.1 ::lc1 ben
|
424 |
+
::s1 ph ::s2 v ::cost 0.1 ::lc1 ben
|
425 |
+
::s1 phana ::s2 von ::cost 0.1 ::lc1 ben
|
426 |
+
::s1 rhio ::s2 gio ::cost 0.2 ::lc1 ben
|
427 |
+
::s1 sh ::s2 s ::cost 0.4 ::lc1 ben
|
428 |
+
::s1 ss ::s2 sh ::left1 /[k]$/ ::cost 0.15 ::lc1 ben
|
429 |
+
::s1 ss ::s2 sh ::cost 0.3 ::lc1 ben
|
430 |
+
::s1 o ::s2 wo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
|
431 |
+
::s1 oye ::s2 we ::cost 0.2 ::lc1 ben
|
432 |
+
::s1 tta ::s2 tho ::cost 0.3 ::lc1 ben
|
433 |
+
::s1 tthaa ::s2 ta ::cost 0.3 ::lc1 ben
|
434 |
+
::s1 u ::s2 wo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
|
435 |
+
::s1 u ::s2 woo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
|
436 |
+
::s1 u ::s2 wu ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
|
437 |
+
::s1 ui ::s2 wi ::cost 0.02 ::lc1 ben ::left1 /^(.*[-, ])?$/
|
438 |
+
::s1 yaa ::s2 wa ::cost 0.3 ::lc1 ben
|
439 |
+
::s1 ye ::s2 we ::cost 0.3 ::lc1 ben
|
440 |
+
|
441 |
+
# Russian
|
442 |
+
::s1 ::s2 os ::cost 0.4 ::left2 /[bcdfghilmnprstvx]$/ ::right2 [-,$ ] ::lc1 rus
|
443 |
+
::s1 ::s2 us ::cost 0.4 ::left2 /[bcdfghilmnprstvx]$/ ::right2 [-,$ ] ::lc1 rus
|
444 |
+
::s1 av ::s2 au ::cost 0.05 ::lc1 rus
|
445 |
+
::s1 ch ::s2 cz ::cost 0.1 ::lc1 rus ::comment Polish
|
446 |
+
::s1 chch ::s2 cci ::right2 [aou] ::cost 0.1 ::lc1 rus
|
447 |
+
::s1 chch ::s2 cc ::right2 [eiy] ::cost 0.1 ::lc1 rus
|
448 |
+
::s1 chzh ::s2 zh ::cost 0.1 ::lc1 rus
|
449 |
+
::s1 dz ::s2 zz ::cost 0.1 ::lc1 rus ::right2 [aeiouy]
|
450 |
+
::s1 dz ::s2 j ::cost 0.3 ::lc1 rus ::right2 [aeiouy] ::comment Japanese
|
451 |
+
::s1 dzh ::s2 g ::cost 0.05 ::lc1 rus ::right2 [eiy]
|
452 |
+
::s1 dzh ::s2 gg ::cost 0.05 ::lc1 rus ::right2 [eiy]
|
453 |
+
::s1 dzh ::s2 j ::cost 0.05 ::lc1 rus
|
454 |
+
::s1 ev ::s2 eu ::cost 0.1 ::lc1 rus
|
455 |
+
::s1 f ::s2 th ::cost 0.6 ::lc1 rus
|
456 |
+
::s1 ievye ::s2 iaceae ::cost 0.02 ::right1 [-,$ ] ::lc1 rus ::comment scientific names for families of species
|
457 |
+
::s1 ii ::s2 ius ::cost 0.2 ::right1 [-,$ ] ::lc1 rus
|
458 |
+
::s1 i ::s2 j ::cost 0.2 ::lc1 rus
|
459 |
+
::s1 naya ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
|
460 |
+
::s1 nyi ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
|
461 |
+
::s1 ovye ::s2 aceae ::cost 0.02 ::right1 [-,$ ] ::lc1 rus ::comment scientific names for families of species
|
462 |
+
::s1 shsh ::s2 sh ::cost 0 ::lc1 rus
|
463 |
+
::s1 skaya ::s2 ian ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
|
464 |
+
::s1 skaya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
|
465 |
+
::s1 skii ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
|
466 |
+
::s1 skii ::s2 ian ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
|
467 |
+
::s1 tsian ::s2 tian ::cost 0.05 ::lc1 rus
|
468 |
+
::s1 tsion ::s2 tion ::cost 0.05 ::lc1 rus
|
469 |
+
::s1 ts ::s2 c ::cost 0.3 ::lc1 rus
|
470 |
+
::s1 ts ::s2 c ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
|
471 |
+
::s1 tsz ::s2 z ::cost 0.1 ::lc1 rus
|
472 |
+
::s1 itsa ::s2 ica ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
|
473 |
+
::s1 etski ::s2 ecky ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
|
474 |
+
::s1 tsiya ::s2 tion ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
|
475 |
+
::s1 tsi ::s2 qi ::cost 0.15 ::lc1 rus ::comment Chinese names
|
476 |
+
::s1 tsy ::s2 qi ::cost 0.15 ::lc1 rus ::comment Chinese names
|
477 |
+
::s1 tszi ::s2 ji ::cost 0.15 ::lc1 rus ::comment Chinese names
|
478 |
+
::s1 tszy ::s2 ji ::cost 0.15 ::lc1 rus ::comment Chinese names
|
479 |
+
::s1 u ::s2 w ::right2 [aeio] ::cost 0.05 ::lc1 rus
|
480 |
+
::s1 u ::s2 w ::cost 0.2 ::lc1 rus
|
481 |
+
::s1 uo ::s2 wa ::cost 0.2 ::lc1 rus ::right2 [lnrst]
|
482 |
+
::s1 v ::s2 u ::cost 0.05 ::lc1 rus ::left1 /[bcdfghjklmnpqrstvwxz]$/ ::right1 [aeiou]
|
483 |
+
::s1 gva ::s2 gua ::cost 0.02 ::lc1 rus
|
484 |
+
::s1 gvi ::s2 gui ::cost 0.02 ::lc1 rus
|
485 |
+
::s1 x ::s2 sh ::cost 0.2 ::left2 /[aeiou]$/ ::right2 [-,aouct$-] ::lc1 rus
|
486 |
+
::s1 y ::s2 s ::cost 0.4 ::right2 [-,$-] ::lc1 rus
|
487 |
+
::s1 zh ::s2 rz ::cost 0.1 ::lc1 rus ::comment Polish rz
|
488 |
+
|
489 |
+
# Russian case endings
|
490 |
+
::s1 em ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
491 |
+
::s1 ey ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
492 |
+
::s1 om ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
493 |
+
::s1 oy ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
494 |
+
::s1 oyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
495 |
+
::s1 y ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
496 |
+
::s1 ya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
497 |
+
::s1 ye ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
498 |
+
::s1 yem ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
499 |
+
::s1 ym ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
500 |
+
::s1 ymi ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
501 |
+
::s1 yu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
502 |
+
::s1 ii ::s2 iya ::cost 0.1 ::right1 [-,$ ] ::right2 [-,$ ] ::lc1 rus ::lc2 rus ::comment Russian case endings
|
503 |
+
::s1 ii ::s2 iye ::cost 0.1 ::right1 [-,$ ] ::right2 [-,$ ] ::lc1 rus ::lc2 rus ::comment Russian case endings
|
504 |
+
|
505 |
+
::s1 am ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
506 |
+
::s1 ami ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
507 |
+
::s1 em ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
508 |
+
::s1 ev ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
509 |
+
::s1 eri ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
510 |
+
::s1 eryu ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
511 |
+
::s1 om ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
512 |
+
::s1 ov ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
513 |
+
::s1 akh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
514 |
+
::s1 ykh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
515 |
+
|
516 |
+
# Ukrainian case endings
|
517 |
+
::s1 eyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
518 |
+
::s1 oyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
519 |
+
::s1 ya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
520 |
+
::s1 yi ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
521 |
+
::s1 yu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
522 |
+
|
523 |
+
::s1 am ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
524 |
+
::s1 amy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
525 |
+
::s1 em ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
526 |
+
::s1 evy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
527 |
+
::s1 iv ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
528 |
+
::s1 om ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
529 |
+
::s1 ovy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
530 |
+
::s1 yam ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
531 |
+
::s1 yamy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
532 |
+
::s1 yiv ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
533 |
+
::s1 akh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
534 |
+
::s1 yakh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
535 |
+
|
536 |
+
# Uyghur
|
537 |
+
::s1 aw ::s2 ao ::cost 0.05 ::lc1 uig
|
538 |
+
::s1 aw ::s2 au ::cost 0.05 ::lc1 uig
|
539 |
+
::s1 gwi ::s2 gui ::cost 0.05 ::lc1 uig
|
540 |
+
::s1 iye ::s2 ia ::cost 0.05 ::lc1 uig
|
541 |
+
::s1 istan ::s2 ia ::cost 0.1 ::right1 [-,$ ] ::lc1 uig
|
542 |
+
::s1 j ::s2 c ::cost 0.4 ::lc1 uig
|
543 |
+
::s1 q ::s2 h ::cost 0.2 ::lc1 uig
|
544 |
+
::s1 sey ::s2 cai ::cost 0.2 ::lc1 uig
|
545 |
+
::s1 sh ::s2 x ::cost 0.2 ::lc1 uig
|
546 |
+
|
547 |
+
::s1 b ::s2 p ::cost 0.3
|
548 |
+
::s1 b ::s2 v ::cost 0.5 ::left2 /^(.*[- ])?$/
|
549 |
+
::s1 b ::s2 v ::cost 0.7
|
550 |
+
::s1 c ::s2 ch ::cost 0.25 ::right1 [eiy]
|
551 |
+
::s1 c ::s2 ck ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
|
552 |
+
::s1 c ::s2 k ::cost 0.4
|
553 |
+
::s1 c ::s2 k ::cost 0.05 ::left1 /^(.* )?ma?$/ ::comment MacIntyre
|
554 |
+
::s1 c ::s2 k ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
|
555 |
+
::s1 c ::s2 kk ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
|
556 |
+
::s1 c ::s2 s ::cost 0.7
|
557 |
+
::s1 c ::s2 s ::cost 0.1 ::right1 [eiy]
|
558 |
+
::s1 c ::s2 ts ::cost 0.15 ::right1 [eiy]
|
559 |
+
::s1 c ::s2 z ::cost 0.3
|
560 |
+
::s1 ch ::s2 ck ::cost 0.2
|
561 |
+
::s1 ch ::s2 g ::cost 0.3 ::right1 [eiy] ::right2 [eiy]
|
562 |
+
::s1 ch ::s2 k ::cost 0.2
|
563 |
+
::s1 ch ::s2 kk ::cost 0.2
|
564 |
+
::s1 ch ::s2 sh ::cost 0.3
|
565 |
+
::s1 ch ::s2 sh ::cost 0.2 ::left1 /[eiy]$/ ::right1 [$ ]
|
566 |
+
::s1 ch ::s2 tch ::cost 0.1
|
567 |
+
::s1 ch ::s2 tsh ::cost 0.1
|
568 |
+
::s1 ch ::s2 z ::cost 0.5
|
569 |
+
::s1 ck ::s2 kk ::cost 0.02
|
570 |
+
::s1 cz ::s2 ch ::cost 0.2 ::left1 /i$/
|
571 |
+
::s1 d ::s2 t ::cost 0.3
|
572 |
+
::s1 de ::s2 dre ::cost 0.3 ::lc1 zho ::right2 [-,$ ]
|
573 |
+
::s1 dg ::s2 j ::cost 0.6 ::lc1 eng ::comment Cambridge
|
574 |
+
::s1 dg ::s2 j ::cost 0.3 ::right1 [eiy] ::lc1 eng
|
575 |
+
::s1 dg ::s2 j ::cost 0.1 ::right1 [eiy] ::lc1 eng ::lc2 fas, jpn
|
576 |
+
::s1 dt ::s2 d ::cost 0.3
|
577 |
+
::s1 dt ::s2 t ::cost 0.03
|
578 |
+
::s1 dt ::s2 tt ::cost 0.03
|
579 |
+
::s1 f ::s2 p ::cost 0.8
|
580 |
+
::s1 f ::s2 ph ::cost 0.01
|
581 |
+
::s1 ff ::s2 ph ::cost 0.02
|
582 |
+
::s1 f ::s2 pf ::cost 0.1
|
583 |
+
::s1 f ::s2 v ::cost 0.3
|
584 |
+
::s1 f ::s2 v ::cost 0.1 ::right1 [-,$ ]
|
585 |
+
::s1 ef ::s2 ev ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
586 |
+
::s1 f ::s2 w ::cost 0.3
|
587 |
+
::s1 g ::s2 j ::cost 0.6
|
588 |
+
::s1 g ::s2 j ::cost 0.3 ::right1 [eiy]
|
589 |
+
::s1 g ::s2 j ::cost 0.1 ::right1 [eiy] ::lc2 amh, ara, fas, jpn, som
|
590 |
+
::s1 g ::s2 k ::cost 0.3
|
591 |
+
::s1 g ::s2 gh ::cost 0.3
|
592 |
+
::s1 g ::s2 ch ::cost 0.4 ::left1 /[eiy]$/ ::right1 [-,$ ] ::comment German: Ludwig, Braunschweig
|
593 |
+
::s1 gh ::s2 f ::cost 0.2 ::lc1 eng ::comment laughter
|
594 |
+
::s1 gh ::s2 "" ::cost 0.2 ::lc1 eng ::comment daughter
|
595 |
+
::s1 gh ::s2 g ::cost 0.2 ::lc1 eng ::comment Afghanistan
|
596 |
+
::s1 gl ::s2 l ::cost 0.2 ::lc1 eng ::right1 [i]
|
597 |
+
::s1 gn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
|
598 |
+
::s1 gn ::s2 n ::cost 0.2 ::lc1 eng
|
599 |
+
::s1 gz ::s2 ks ::cost 0.2
|
600 |
+
::s1 h ::s2 e ::cost 0.4 ::lc1 fas
|
601 |
+
::s1 ise ::s2 ize ::cost 0.1
|
602 |
+
::s1 j ::s2 y ::cost 0.2
|
603 |
+
::s1 j ::s2 dj ::cost 0.2
|
604 |
+
::s1 j ::s2 h ::cost 0.4 ::right2 [aeiou] ::lc2 amh ::example Jose
|
605 |
+
::s1 j ::s2 hh ::cost 0.4 ::right2 [aeiou] ::lc2 amh ::example Tardajos
|
606 |
+
::s1 j ::s2 zh ::cost 0.2
|
607 |
+
::s1 k ::s2 cc ::cost 0.02 ::right2 [aour]
|
608 |
+
::s1 k ::s2 cc ::cost 0.3
|
609 |
+
::s1 k ::s2 cch ::cost 0.15
|
610 |
+
::s1 k ::s2 ck ::cost 0.02
|
611 |
+
::s1 k ::s2 cq ::cost 0.05
|
612 |
+
::s1 k ::s2 cqu ::cost 0.05
|
613 |
+
::s1 k ::s2 cque ::cost 0.1
|
614 |
+
::s1 k ::s2 cque ::cost 0.05 ::right2 [-,$ ]
|
615 |
+
::s1 k ::s2 cques ::cost 0.05 ::right2 [-,$ ]
|
616 |
+
::s1 k ::s2 q ::cost 0.05
|
617 |
+
::s1 k ::s2 qu ::cost 0.05
|
618 |
+
::s1 k ::s2 que ::cost 0.1
|
619 |
+
::s1 k ::s2 que ::cost 0.05 ::right2 [-,$ ]
|
620 |
+
::s1 k ::s2 ques ::cost 0.1 ::right2 [-,$ ]
|
621 |
+
::s1 kh ::s2 j ::cost 0.2
|
622 |
+
::s1 kh ::s2 q ::cost 0.2
|
623 |
+
::s1 kh ::s2 k ::cost 0.25 ::right1 [aeiouy]
|
624 |
+
::s1 kh ::s2 k ::cost 0.1 ::right1 [aeiouys] ::lc2 amh
|
625 |
+
::s1 kn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
|
626 |
+
::s1 kj ::s2 sh ::cost 0.2 ::comment Swedish
|
627 |
+
::s1 l ::s2 r ::cost 0.1 ::lc1 zho
|
628 |
+
::s1 aib ::s2 alb ::cost 0.1 ::lc1 zho
|
629 |
+
::s1 al ::s2 ::cost 0.5 ::left1 /^(.* )?$/
|
630 |
+
::s1 al- ::s2 ::cost 0.3 ::left1 /^(.* )?$/
|
631 |
+
::s1 el ::s2 ::cost 0.5 ::left1 /^(.* )?$/
|
632 |
+
::s1 el- ::s2 ::cost 0.3 ::left1 /^(.* )?$/
|
633 |
+
::s1 ll ::s2 y ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [aeiouy] ::comment Guillermo, Guillaume
|
634 |
+
::s1 mb ::s2 m ::cost 0.2 ::right1 [-,bcdfghklmnpqstvwxz$ ] ::lc1 eng ::comment bomb
|
635 |
+
::s1 n ::s2 m ::cost 0.5 ::left1 /[aeiou]$/ ::left2 /[aeiou]$/ ::right1 [bcdfghklmnpqrstvwxz$ ] ::right2 [-,bcdfghklmnpqrstvwxz$ ]
|
636 |
+
::s1 ng ::s2 n ::cost 0.1 ::left1 /[aeiou]$/ ::lc1 zho
|
637 |
+
::s1 ng ::s2 m ::cost 0.25 ::left1 /[aeiou]$/ ::lc1 zho
|
638 |
+
::s1 ng ::s2 n ::cost 0.1 ::left2 /[aeiou]$/ ::lc2 ara, ben, rus, zho
|
639 |
+
::s1 nm ::s2 m ::cost 0.25 ::lc1 zho
|
640 |
+
::s1 pn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
|
641 |
+
::s1 ph ::s2 p ::cost 0.3 ::lc1 amh
|
642 |
+
::s1 q ::s2 c ::cost 0.15
|
643 |
+
::s1 q ::s2 ch ::cost 0.2 ::right2 [eiy]
|
644 |
+
::s1 q ::s2 ck ::cost 0.2
|
645 |
+
::s1 q ::s2 kk ::cost 0.2
|
646 |
+
::s1 q ::s2 gh ::cost 0.2 ::lc1 fas ::right2 [aeiouy]
|
647 |
+
::s1 qi ::s2 ch ::cost 0.2 ::lc1 zho ::right1 [aeou]
|
648 |
+
::s1 qi ::s2 cci ::cost 0.1 ::lc1 zho
|
649 |
+
::s1 qi ::s2 chi ::cost 0.1 ::lc1 zho
|
650 |
+
::s1 qi ::s2 tch ::cost 0.2 ::lc1 zho ::right1 [aeou]
|
651 |
+
::s1 qi ::s2 ts ::cost 0.4 ::lc1 zho ::right1 [aeou]
|
652 |
+
::s1 qi ::s2 tsch ::cost 0.2 ::lc1 zho ::right1 [aeou]
|
653 |
+
::s1 qi ::s2 tzsch ::cost 0.2 ::lc1 zho ::right1 [aeou]
|
654 |
+
::s1 qi ::s2 czy ::cost 0.2 ::lc1 zho
|
655 |
+
::s1 qu ::s2 kw ::cost 0.15
|
656 |
+
::s1 qu ::s2 kv ::cost 0.15
|
657 |
+
::s1 e ::s2 er ::cost 0.25 ::left1 /[bcdfghklmnpqrstvwxz]$/ ::lc1 zho
|
658 |
+
::s1 re ::s2 er ::cost 0.1
|
659 |
+
::s1 rh ::s2 r ::cost 0.05 ::left1 /^(.*[- ])?$/ ::example Rhine
|
660 |
+
::s1 s ::s2 sh ::cost 0.03 ::right2 [aeiou] ::lc2 amh
|
661 |
+
::s1 s ::s2 sz ::cost 0.3 ::lc2 eng ::example Liszt (Hungarian)
|
662 |
+
::s1 s ::s2 ts ::cost 0.4 ::lc1 amh, zho
|
663 |
+
::s1 s ::s2 z ::cost 0.4
|
664 |
+
::s1 s ::s2 z ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [aeiouy] ::lc1 eng
|
665 |
+
::s1 s ::s2 z ::cost 0.1 ::left1 /[aeiouy][bdglmnrvw]?$/ ::right1 [-,$ ] ::lc1 eng
|
666 |
+
::s1 s ::s2 z ::cost 0.2 ::lc2 fas
|
667 |
+
::s1 sc ::s2 s ::cost 0.2 ::right1 [i] ::example Nascimento
|
668 |
+
::s1 sci ::s2 sh ::cost 0.2 ::example Brescia
|
669 |
+
::s1 sch ::s2 sh ::cost 0.1
|
670 |
+
::s1 sh ::s2 sz ::cost 0.2 ::example Mariusz (Polish) ::lc2 eng
|
671 |
+
::s1 si ::s2 j ::cost 0.1 ::right2 [a] ::lc1 eng
|
672 |
+
::s1 ss ::s2 z ::cost 0.5
|
673 |
+
# ::s1 smith ::s2 mith ::cost 0.75 ::lc2 zho ::comment weird, but several different Xinhua examples
|
674 |
+
::s1 tch ::s2 c ::cost 0.2 ::left2 /[aeiou]$/ ::right2 [-,e$ ]
|
675 |
+
::s1 te ::s2 tre ::cost 0.3 ::lc1 zho ::right2 [-,$ ]
|
676 |
+
::s1 th ::s2 t ::cost 0.2 ::lc2 amh, fas, uig
|
677 |
+
::s1 th ::s2 s ::cost 0.4 ::lc2 zho
|
678 |
+
::s1 th ::s2 sth ::cost 0.4 ::lc1 zho
|
679 |
+
::s1 th ::s2 ths ::cost 0.4 ::lc1 zho
|
680 |
+
::s1 th ::s2 z ::cost 0.3 ::lc2 amh ::right2 [-,$ aeot]
|
681 |
+
::s1 v ::s2 w ::cost 0.02
|
682 |
+
::s1 v ::s2 wh ::cost 0.02 ::left1 /^(.* )?$/
|
683 |
+
::s1 vv ::s2 w ::cost 0.02
|
684 |
+
::s1 w ::s2 u ::cost 0.1 ::lc2 uig
|
685 |
+
::s1 wa ::s2 ua ::cost 0.05
|
686 |
+
::s1 wh ::s2 w ::cost 0.05 ::left1 /^(.* )?$/
|
687 |
+
::s1 wr ::s2 r ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
|
688 |
+
::s1 x ::s2 ks ::cost 0.05
|
689 |
+
::s1 x ::s2 s ::cost 0.2 ::left1 /^(.* )?$/
|
690 |
+
::s1 x ::s2 sh ::cost 0.2 ::lc1 uig ::left1 /^(.* )?$/ ::right1 [aeiou]
|
691 |
+
::s1 x ::s2 z ::cost 0.2 ::left1 /^(.* )?$/ ::right1 [aeiouy]
|
692 |
+
::s1 x ::s2 h ::cost 0.3 ::lc1 uig
|
693 |
+
::s1 x ::s2 h ::cost 0.05 ::lc1 uig ::left1 /^(.* )?$/ ::right1 [aeiou]
|
694 |
+
::s1 x ::s2 kh ::cost 0.1 ::lc1 uig
|
695 |
+
::s1 xi ::s2 sch ::cost 0.2 ::right1 [aeou] ::lc1 zho
|
696 |
+
::s1 xi ::s2 sh ::cost 0.2 ::right1 [aeou] ::lc1 zho
|
697 |
+
::s1 xi ::s2 ch ::cost 0.4 ::right1 [aeou] ::lc1 zho
|
698 |
+
::s1 xi ::s2 sci ::cost 0.4 ::right1 [aeou] ::lc1 zho
|
699 |
+
::s1 xi ::s2 s ::cost 0.6 ::right1 [aeou] ::lc1 zho
|
700 |
+
::s1 z ::s2 dz ::cost 0.1 ::left1 /^(.*[ aeiouy])?[lnr]?$/
|
701 |
+
::s1 z ::s2 ts ::cost 0.15
|
702 |
+
::s1 z ::s2 tz ::cost 0.15
|
703 |
+
::s1 zh ::s2 g ::cost 0.2 ::right2 [eiy]
|
704 |
+
::s1 zh ::s2 g ::cost 0.1 ::right2 [eiy] ::lc2 amh
|
705 |
+
::s1 zz ::s2 ts ::cost 0.15
|
706 |
+
::s1 zz ::s2 tz ::cost 0.1
|
707 |
+
|
708 |
+
# Oromo
|
709 |
+
::s1 nb ::s2 mb ::cost 0.4 ::lc1 orm ::lc2 orm ::left1 /[aeiou]$/ ::left2 /[aeiou]$/
|
710 |
+
::s1 np ::s2 mp ::cost 0.4 ::lc1 orm ::lc2 orm ::left1 /[aeiou]$/ ::left2 /[aeiou]$/
|
711 |
+
::s1 ph ::s2 p ::cost 0.3 ::lc1 orm ::lc2 orm
|
712 |
+
|
713 |
+
# Tigrinya
|
714 |
+
::s1 aaye ::s2 a ::cost 0.4 ::lc1 tir ::lc2 tir ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz] ::comment internal plural
|
715 |
+
::s1 aaye ::s2 i ::cost 0.4 ::lc1 tir ::lc2 tir ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz] ::comment internal plural
|
716 |
+
|
717 |
+
# Somali
|
718 |
+
::s1 ay ::s2 ey ::cost 0.1 ::lc1 som ::lc2 som
|
719 |
+
::s1 ay ::s2 eey ::cost 0.15 ::lc1 som ::lc2 som
|
720 |
+
::s1 aha ::s2 ihii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
721 |
+
::s1 aha ::s2 ihi ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
722 |
+
::s1 aha ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
723 |
+
::s1 ihii ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
724 |
+
::s1 ihi ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
725 |
+
::s1 ha ::s2 hii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
726 |
+
::s1 ha ::s2 hi ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
727 |
+
::s1 ha ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
728 |
+
::s1 hii ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
729 |
+
::s1 hi ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
730 |
+
::s1 aka ::s2 ikii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
731 |
+
::s1 aka ::s2 iki ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
732 |
+
::s1 aka ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
733 |
+
::s1 ikii ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
734 |
+
::s1 iki ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
735 |
+
::s1 ka ::s2 kii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
736 |
+
::s1 ka ::s2 ki ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
737 |
+
::s1 ka ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
738 |
+
::s1 kii ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
739 |
+
::s1 ki ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
740 |
+
::s1 aga ::s2 ugu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
741 |
+
::s1 ga ::s2 gu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
742 |
+
::s1 ata ::s2 itii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
743 |
+
::s1 ata ::s2 iti ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
744 |
+
::s1 ata ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
745 |
+
::s1 itii ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
746 |
+
::s1 iti ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
747 |
+
::s1 ta ::s2 tii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
748 |
+
::s1 ta ::s2 ti ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
749 |
+
::s1 ta ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
750 |
+
::s1 tii ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
751 |
+
::s1 ti ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
752 |
+
::s1 ata ::s2 ete ::cost 0.15 ::lc1 som ::lc2 som
|
753 |
+
::s1 ata ::s2 iti ::cost 0.2 ::lc1 som ::lc2 som
|
754 |
+
::s1 ete ::s2 iti ::cost 0.15 ::lc1 som ::lc2 som
|
755 |
+
::s1 g ::s2 k ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
|
756 |
+
::s1 g ::s2 k ::cost 0.25 ::lc1 som ::lc2 som
|
757 |
+
::s1 g ::s2 kh ::cost 0.25 ::lc1 som ::lc2 som
|
758 |
+
::s1 gh ::s2 kh ::cost 0.1 ::lc1 som ::lc2 som
|
759 |
+
::s1 gh ::s2 k ::cost 0.2 ::lc1 som ::lc2 som
|
760 |
+
::s1 g ::s2 q ::cost 0.25 ::lc1 som ::lc2 som
|
761 |
+
::s1 g ::s2 q ::cost 0.2 ::lc1 som ::lc2 som ::right1 [aou] ::right2 [aou]
|
762 |
+
::s1 ga ::s2 q ::cost 0.2 ::lc1 som ::lc2 som ::left1 /^(.*[aeiou])?$/ ::left2 /^(.*[aeiou])?$/ ::right1 [bcdfghklmnpqrstvwxz] ::right2 [bcdfghklmnpqrstvwxz]
|
763 |
+
::s1 g ::s2 j ::cost 0.25 ::lc1 som ::lc2 som
|
764 |
+
::s1 g ::s2 j ::cost 0.15 ::lc1 som ::lc2 som ::right1 [ei] ::right2 [ei]
|
765 |
+
::s1 gi ::s2 j ::cost 0.15 ::lc1 som ::lc2 som ::right2 [ei]
|
766 |
+
::s1 n ::s2 m ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
|
767 |
+
::s1 n ::s2 mm ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
|
768 |
+
::s1 n ::s2 m ::cost 0.25 ::lc1 som ::lc2 som ::right2 [aeiko]
|
769 |
+
::s1 n ::s2 mm ::cost 0.25 ::lc1 som ::lc2 som ::right2 [aeiko]
|
770 |
+
::s1 ii ::s2 a ::cost 0.15 ::lc1 som ::lc2 som
|
771 |
+
::s1 y ::s2 dj ::cost 0.2 ::lc2 som
|
772 |
+
::s1 ca ::s2 a ::cost 0.15 ::left1 /^(.*[-, ])?$/ ::lc1 som
|
773 |
+
::s1 c ::s2 ::cost 0.25 ::left1 /^(.*[-, ])?$/ ::lc1 som
|
774 |
+
::s1 x ::s2 h ::cost 0.25 ::lc1 som
|
775 |
+
::s1 x ::s2 h ::cost 0.05 ::lc1 som ::left1 /^(.* )?$/ ::right1 [aeiou]
|
776 |
+
::s1 x ::s2 h ::cost 0.1 ::lc1 som ::left1 /[aeiou]$/
|
777 |
+
::s1 b ::s2 p ::cost 0.1 ::lc1 som
|
778 |
+
::s1 majm ::s2 mahm ::cost 0.1 ::lc1 som
|
779 |
+
::s1 chalim ::s2 halim ::cost 0.1 ::lc1 som ::lc2 som
|
780 |
+
::s1 chalim ::s2 jalim ::cost 0.1 ::lc1 som ::lc2 som
|
781 |
+
::s1 chalim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
|
782 |
+
::s1 halim ::s2 jalim ::cost 0.1 ::lc1 som ::lc2 som
|
783 |
+
::s1 halim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
|
784 |
+
::s1 jalim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
|
785 |
+
::s1 dh ::s2 r ::cost 0.25 ::lc1 som ::lc2 som ::left1 /[aeiou]$/
|
786 |
+
::s1 j ::s2 ch ::cost 0.25 ::lc1 som ::lc2 som
|
787 |
+
::s1 j ::s2 kh ::cost 0.25 ::lc1 som ::lc2 som
|
788 |
+
::s1 ch ::s2 sh ::cost 0.2 ::lc1 som ::lc2 som
|
789 |
+
|
790 |
+
# French
|
791 |
+
::s1 aud ::s2 o ::cost 0.3 ::right1 [-,$ ] ::lc1 eng, fra
|
792 |
+
::s1 aux ::s2 o ::cost 0.05 ::right1 [-,$ ]
|
793 |
+
::s1 eaux ::s2 o ::cost 0.05 ::right1 [-,$ ]
|
794 |
+
::s1 eux ::s2 o ::cost 0.05 ::right1 [-,$ ]
|
795 |
+
::s1 eux ::s2 e ::cost 0.15 ::right1 [-,$ ]
|
796 |
+
|
797 |
+
::s1 - ::s2 " " ::cost 0.1
|
798 |
+
::s1 : ::s2 , ::cost 0.1 ::lc1 amh
|
799 |
+
|
800 |
+
# mini dictionary Amharic-English
|
801 |
+
::s1 dabube ::s2 south ::cost 0 ::lc1 amh ::lc2 eng
|
802 |
+
::s1 daseete ::s2 island ::cost 0 ::lc1 amh ::lc2 eng
|
803 |
+
::s1 daseetoche ::s2 islands ::cost 0 ::lc1 amh ::lc2 eng
|
804 |
+
::s1 kaaweneti ::s2 county ::cost 0 ::lc1 amh ::lc2 eng
|
805 |
+
::s1 katamaa ::s2 city ::cost 0 ::lc1 amh ::lc2 eng
|
806 |
+
::s1 kelele ::s2 region ::cost 0 ::lc1 amh ::lc2 eng
|
807 |
+
::s1 meseraaqe ::s2 east ::cost 0 ::lc1 amh ::lc2 eng
|
808 |
+
::s1 sameene ::s2 north ::cost 0 ::lc1 amh ::lc2 eng
|
809 |
+
::s1 setaadiyame ::s2 stadium ::cost 0 ::lc1 amh ::lc2 eng
|
810 |
+
::s1 waneze ::s2 river ::cost 0 ::lc1 amh ::lc2 eng
|
811 |
+
|
812 |
+
# mini dictionary Arabic-English
|
813 |
+
::s1 " " ::s2 " of " ::cost 0 ::lc1 ara ::lc2 eng
|
814 |
+
::s1 " alawl" ::s2 " i" ::cost 0 ::lc1 ara ::lc2 eng ::right2 [-,$ ]
|
815 |
+
|
816 |
+
# mini dictionary Bengali-English
|
817 |
+
::s1 anychala ::s2 zone ::cost 0 ::lc1 ben ::lc2 eng
|
818 |
+
::s1 pradesha ::s2 province ::cost 0 ::lc1 ben ::lc2 eng
|
819 |
+
::s1 saamraajya ::s2 empire ::cost 0 ::lc1 ben ::lc2 eng
|
820 |
+
::s1 upajelaa ::s2 upazila ::cost 0 ::lc1 ben ::lc2 eng
|
821 |
+
::s1 uttara ::s2 north ::cost 0 ::lc1 ben ::lc2 eng
|
822 |
+
::s1 "dya " ::s2 "the " ::left1 /^(.*[-, ])?$/ ::cost 0.2 ::lc1 ben ::lc2 eng
|
823 |
+
::s1 " aba " ::s2 " of " ::cost 0 ::lc1 ben ::lc2 eng
|
824 |
+
|
825 |
+
# mini dictionary Russian-English
|
826 |
+
::s1 akademiya ::s2 academy ::cost 0 ::lc1 rus ::lc2 eng
|
827 |
+
::s1 eparkhiya ::s2 diocese ::cost 0 ::lc1 rus ::lc2 eng
|
828 |
+
::s1 gorod ::s2 city ::cost 0 ::lc1 rus ::lc2 eng
|
829 |
+
::s1 gosudarstvennyi ::s2 state ::cost 0 ::lc1 rus ::lc2 eng
|
830 |
+
::s1 gubernator ::s2 governor ::cost 0 ::lc1 rus ::lc2 eng
|
831 |
+
::s1 guberniya ::s2 governate ::cost 0 ::lc1 rus ::lc2 eng
|
832 |
+
::s1 imperator ::s2 emperor ::cost 0 ::lc1 rus ::lc2 eng
|
833 |
+
::s1 komitet ::s2 committee ::cost 0 ::lc1 rus ::lc2 eng
|
834 |
+
::s1 korolevstvo ::s2 kingdom ::cost 0 ::lc1 rus ::lc2 eng
|
835 |
+
::s1 koroli ::s2 king ::cost 0 ::lc1 rus ::lc2 eng
|
836 |
+
::s1 mezhdunarodnaya ::s2 international ::cost 0 ::lc1 rus ::lc2 eng
|
837 |
+
::s1 natsionalnyi ::s2 national ::cost 0 ::lc1 rus ::lc2 eng
|
838 |
+
::s1 novyi ::s2 new ::cost 0 ::lc1 rus ::lc2 eng
|
839 |
+
::s1 oblast ::s2 province ::cost 0 ::lc1 rus ::lc2 eng
|
840 |
+
::s1 oblast ::s2 region ::cost 0 ::lc1 rus ::lc2 eng
|
841 |
+
::s1 obshchestvo ::s2 society ::cost 0 ::lc1 rus ::lc2 eng
|
842 |
+
::s1 okrug ::s2 district ::cost 0 ::lc1 rus ::lc2 eng
|
843 |
+
::s1 okrug ::s2 region ::cost 0 ::lc1 rus ::lc2 eng
|
844 |
+
::s1 ostrova ::s2 island ::cost 0 ::lc1 rus ::lc2 eng
|
845 |
+
::s1 partiya ::s2 party ::cost 0 ::lc1 rus ::lc2 eng
|
846 |
+
::s1 raion ::s2 district ::cost 0 ::lc1 rus ::lc2 eng
|
847 |
+
::s1 respublika ::s2 republic ::cost 0 ::lc1 rus ::lc2 eng
|
848 |
+
::s1 respublik ::s2 republic ::cost 0 ::lc1 rus ::lc2 eng
|
849 |
+
::s1 sbornaya ::s2 team ::cost 0 ::lc1 rus ::lc2 eng
|
850 |
+
::s1 severnaya ::s2 north ::cost 0 ::lc1 rus ::lc2 eng
|
851 |
+
::s1 sovet ::s2 council ::cost 0 ::lc1 rus ::lc2 eng
|
852 |
+
::s1 soyuz ::s2 alliance ::cost 0 ::lc1 rus ::lc2 eng
|
853 |
+
::s1 soyuz ::s2 association ::cost 0 ::lc1 rus ::lc2 eng
|
854 |
+
::s1 soyuz ::s2 league ::cost 0 ::lc1 rus ::lc2 eng
|
855 |
+
::s1 soyuz ::s2 union ::cost 0 ::lc1 rus ::lc2 eng
|
856 |
+
::s1 svyataya ::s2 saint ::cost 0 ::lc1 rus ::lc2 eng
|
857 |
+
::s1 svobodnyi ::s2 free ::cost 0 ::lc1 rus ::lc2 eng
|
858 |
+
::s1 tserkov ::s2 church ::cost 0 ::lc1 rus ::lc2 eng
|
859 |
+
::s1 uezd ::s2 county ::cost 0 ::lc1 rus ::lc2 eng
|
860 |
+
::s1 universitet ::s2 university ::cost 0 ::lc1 rus ::lc2 eng
|
861 |
+
::s1 vostochnaya ::s2 east ::cost 0 ::lc1 rus ::lc2 eng
|
862 |
+
::s1 vostochnaya ::s2 eastern ::cost 0 ::lc1 rus ::lc2 eng
|
863 |
+
::s1 yuzhnaya ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
|
864 |
+
::s1 yuzhnaya ::s2 southern ::cost 0 ::lc1 rus ::lc2 eng
|
865 |
+
::s1 yuzhnoi ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
|
866 |
+
::s1 yuzhnoi ::s2 southern ::cost 0 ::lc1 rus ::lc2 eng
|
867 |
+
::s1 yuzhnyi ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
|
868 |
+
# often dropped in Russian name
|
869 |
+
::s1 ::s2 county ::cost 0 ::lc1 rus ::lc2 eng
|
870 |
+
::s1 ::s2 island ::cost 0 ::lc1 rus ::lc2 eng
|
871 |
+
::s1 ::s2 pope ::cost 0 ::lc1 rus ::lc2 eng
|
872 |
+
::s1 ::s2 river ::cost 0 ::lc1 rus ::lc2 eng
|
873 |
+
::s1 ::s2 "the " ::cost 0 ::lc1 rus ::lc2 eng ::left2 /^(.*[- ])?$/
|
874 |
+
::s1 " " ::s2 " of " ::cost 0 ::lc1 rus ::lc2 eng
|
875 |
+
|
876 |
+
|
877 |
+
# mini dictionary Uyghur-English
|
878 |
+
::s1 aptonom ::s2 autonomous ::cost 0 ::lc1 uig ::lc2 eng
|
879 |
+
::s1 aralliri ::s2 islands ::cost 0 ::lc1 uig ::lc2 eng
|
880 |
+
::s1 aralliri ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
881 |
+
::s1 arili ::s2 island ::cost 0 ::lc1 uig ::lc2 eng
|
882 |
+
::s1 arili ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
883 |
+
::s1 nahiyisi ::s2 county ::cost 0 ::lc1 uig ::lc2 eng
|
884 |
+
::s1 oelkisi ::s2 province ::cost 0 ::lc1 uig ::lc2 eng
|
885 |
+
::s1 oelkisi ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
886 |
+
::s1 ottura ::s2 central ::cost 0 ::lc1 uig ::lc2 eng
|
887 |
+
::s1 rayoni ::s2 region ::cost 0 ::lc1 uig ::lc2 eng
|
888 |
+
::s1 shehiri ::s2 city ::cost 0 ::lc1 uig ::lc2 eng
|
889 |
+
::s1 shehiri ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
890 |
+
::s1 shitati ::s2 state ::cost 0 ::lc1 uig ::lc2 eng
|
891 |
+
::s1 shitati ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
892 |
+
::s1 shtati ::s2 state ::cost 0 ::lc1 uig ::lc2 eng
|
893 |
+
::s1 shtati ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
894 |
+
::s1 uniwersiteti ::s2 university ::cost 0 ::lc1 uig ::lc2 eng
|
895 |
+
::s1 yengi ::s2 new ::cost 0 ::lc1 uig ::lc2 eng
|
896 |
+
|
uroman/lib/JSON.pm
ADDED
@@ -0,0 +1,2317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
package JSON;
|
2 |
+
|
3 |
+
|
4 |
+
use strict;
|
5 |
+
use Carp ();
|
6 |
+
use base qw(Exporter);
|
7 |
+
@JSON::EXPORT = qw(from_json to_json jsonToObj objToJson encode_json decode_json);
|
8 |
+
|
9 |
+
# Compile-time setup: pin the module version and resolve the debug flag.
BEGIN {
    $JSON::VERSION = '2.90';

    # Keep a value set before this module was loaded; otherwise default to off.
    if (not defined $JSON::DEBUG) {
        $JSON::DEBUG = 0;
    }

    # When PERL_JSON_DEBUG is present in the environment it always wins.
    if (exists $ENV{PERL_JSON_DEBUG}) {
        $JSON::DEBUG = $ENV{PERL_JSON_DEBUG};
    }
}
|
14 |
+
|
15 |
+
# Names of the candidate worker modules.
my $Module_XS  = 'JSON::XS';
my $Module_PP  = 'JSON::PP';
my $Module_bp  = 'JSON::backportPP'; # included in JSON distribution

# Versions requested when the PP / XS backends are loaded with "use".
my $PP_Version = '2.27203';
my $XS_Version = '2.34';


# XS and PP common methods

my @PublicMethods = qw(
    ascii latin1 utf8 pretty indent space_before space_after relaxed canonical allow_nonref
    allow_blessed convert_blessed filter_json_object filter_json_single_key_object
    shrink max_depth max_size encode decode decode_prefix allow_unknown
);

# Settings that property() reads back through their get_* accessors.
my @Properties = qw(
    ascii latin1 utf8 indent space_before space_after relaxed canonical allow_nonref
    allow_blessed convert_blessed shrink max_depth max_size allow_unknown
);

# Methods stubbed out with a warning when the PP backend is active.
my @XSOnlyMethods = qw(allow_tags);

# JSON::PP specific methods; stubbed out with a warning when XS is active.
my @PPOnlyMethods = qw(
    indent_length sort_by
    allow_singlequote allow_bignum loose allow_barekey escape_slash as_nonblessed
);


# Option bits passed to _load_xs and _load_pp.
my $_INSTALL_DONT_DIE  = 1; # When _load_xs fails to load XS, don't die.
my $_INSTALL_ONLY      = 2; # Load only: skip _set_module() and the backend init.

# Counters / switches shared by import() and the loaders.
my $_ALLOW_UNSUPPORTED = 0; # incremented for every '-support_by_pp' import tag
my $_UNIV_CONV_BLESSED = 0; # incremented for every '-convert_blessed_universally' tag
my $_USSING_bpPP       = 0; # true once JSON::backportPP is the PP implementation
|
49 |
+
|
50 |
+
|
51 |
+
# Check the environment variable to decide worker module.
|
52 |
+
|
53 |
+
# Choose and load the worker module, unless a backend is already set.
if (not $JSON::Backend) {
    Carp::carp("Check used worker module...") if $JSON::DEBUG;

    # PERL_JSON_BACKEND selects the policy; the default (1) is "XS, then PP".
    my $backend = 1;
    $backend = $ENV{PERL_JSON_BACKEND} if exists $ENV{PERL_JSON_BACKEND};

    if ($backend eq '1' or $backend =~ /JSON::XS\s*,\s*JSON::PP/) {
        # Try XS quietly and fall back to pure Perl when it cannot be loaded.
        _load_pp() unless _load_xs($_INSTALL_DONT_DIE);
    }
    elsif ($backend eq '0' or $backend eq 'JSON::PP') {
        _load_pp();
    }
    elsif ($backend eq '2' or $backend eq 'JSON::XS') {
        # XS only: a load failure is fatal here.
        _load_xs();
    }
    elsif ($backend eq 'JSON::backportPP') {
        # Force the bundled pure-Perl implementation.
        $_USSING_bpPP = 1;
        _load_pp();
    }
    else {
        Carp::croak "The value of environmental variable 'PERL_JSON_BACKEND' is invalid.";
    }
}
|
75 |
+
|
76 |
+
|
77 |
+
# Handle "use JSON ...": consume the option tags understood here and hand
# every other argument to Exporter.
#
#   -support_by_pp                 on its first occurrence, and only when the XS
#                                  backend is active, enable the PP-only methods
#   -no_export                     export nothing at all
#   -convert_blessed_universally   on its first occurrence, install
#                                  UNIVERSAL::TO_JSON
sub import {
    my $pkg = shift;
    my @symbols;
    my $suppress_export;

    for my $arg (@_) {
        if ($arg eq '-support_by_pp') {
            # The counter is bumped on every occurrence, whatever the backend.
            my $first_request = !$_ALLOW_UNSUPPORTED++;
            if ($first_request and $JSON::Backend eq $Module_XS) {
                JSON::Backend::XS->support_by_pp(@PPOnlyMethods);
            }
            next;
        }

        if ($arg eq '-no_export') {
            $suppress_export++;
            next;
        }

        if ($arg eq '-convert_blessed_universally') {
            unless ($_UNIV_CONV_BLESSED++) {
                # TO_JSON returns a shallow copy of a hash- or array-based
                # object, and undef for anything else.
                eval q|
                    require B;
                    *UNIVERSAL::TO_JSON = sub {
                        my $b_obj = B::svref_2object( $_[0] );
                        return    $b_obj->isa('B::HV') ? { %{ $_[0] } }
                                : $b_obj->isa('B::AV') ? [ @{ $_[0] } ]
                                : undef
                                ;
                    }
                |;
            }
            next;
        }

        # Anything else is treated as a symbol for Exporter.
        push @symbols, $arg;
    }

    return if $suppress_export;

    __PACKAGE__->export_to_level(1, $pkg, @symbols);
}
|
113 |
+
|
114 |
+
|
115 |
+
# OBSOLETED
|
116 |
+
|
117 |
+
# Deprecated alias for from_json. It may be called as a function or on
# anything that isa JSON; in the latter case the invocant is dropped and the
# warning recommends 'decode' instead of 'from_json'.
sub jsonToObj {
    my $via_invocant = (defined $_[0] and UNIVERSAL::isa($_[0], 'JSON'));
    shift @_ if $via_invocant;

    my $alternative = $via_invocant ? 'decode' : 'from_json';
    Carp::carp "'jsonToObj' will be obsoleted. Please use '$alternative' instead.";

    return JSON::from_json(@_);
};
|
125 |
+
|
126 |
+
# Deprecated alias for to_json. It may be called as a function or on anything
# that isa JSON; in the latter case the invocant is dropped and the warning
# recommends 'encode' instead of 'to_json'.
sub objToJson {
    my $via_invocant = (defined $_[0] and UNIVERSAL::isa($_[0], 'JSON'));
    shift @_ if $via_invocant;

    my $alternative = $via_invocant ? 'encode' : 'to_json';
    Carp::carp "'objToJson' will be obsoleted. Please use '$alternative' instead.";

    JSON::to_json(@_);
};
|
134 |
+
|
135 |
+
|
136 |
+
# INTERFACES
|
137 |
+
|
138 |
+
# Functional interface: encode the first argument with a fresh JSON object.
# An optional second argument, a hash ref of { method_name => argument },
# configures that object before encoding. Croaks when invoked as a method.
sub to_json ($@) {
    my $called_as_method = ref($_[0]) eq 'JSON' || (@_ > 2 && $_[0] eq 'JSON');
    Carp::croak "to_json should not be called as a method." if $called_as_method;

    my $json = JSON->new;

    if (@_ == 2 and ref $_[1] eq 'HASH') {
        my $settings = $_[1];
        foreach my $setting (keys %$settings) {
            $json->$setting( $settings->{$setting} );
        }
    }

    return $json->encode($_[0]);
}
|
156 |
+
|
157 |
+
|
158 |
+
# Functional interface: decode the first argument with a fresh JSON object.
# An optional second argument, a hash ref of { method_name => argument },
# configures that object before decoding. Croaks when invoked as a method.
sub from_json ($@) {
    my $called_as_method = ref($_[0]) eq 'JSON' || $_[0] eq 'JSON';
    Carp::croak "from_json should not be called as a method." if $called_as_method;

    my $json = JSON->new;

    if (@_ == 2 and ref $_[1] eq 'HASH') {
        my $settings = $_[1];
        foreach my $setting (keys %$settings) {
            $json->$setting( $settings->{$setting} );
        }
    }

    return $json->decode( $_[0] );
}
|
173 |
+
|
174 |
+
|
175 |
+
|
176 |
+
# Return the value held in $JSON::true (assigned by _set_module).
sub true {
    return $JSON::true;
}
|
177 |
+
|
178 |
+
# Return the value held in $JSON::false (assigned by _set_module).
sub false {
    return $JSON::false;
}
|
179 |
+
|
180 |
+
# The JSON null value is represented by Perl's undef.
sub null {
    return undef;
}
|
181 |
+
|
182 |
+
|
183 |
+
# Report the JSON::XS version this module asks for when loading the XS backend.
sub require_xs_version {
    return $XS_Version;
}
|
184 |
+
|
185 |
+
# Return the name of the backend module in use ($JSON::Backend).
# The invocant (class or object) is accepted but ignored.
sub backend {
    shift;
    return $JSON::Backend;
}
|
189 |
+
|
190 |
+
#*module = *backend;
|
191 |
+
|
192 |
+
|
193 |
+
# True when the active backend is the XS module.
sub is_xs {
    my ($self) = @_;
    return $self->backend eq $Module_XS;
}
|
196 |
+
|
197 |
+
|
198 |
+
# True when the active backend is not the XS module.
sub is_pp {
    my ($self) = @_;
    return !$self->is_xs;
}
|
201 |
+
|
202 |
+
|
203 |
+
# List the method names that only the pure-Perl backend implements.
sub pureperl_only_methods {
    return @PPOnlyMethods;
}
|
204 |
+
|
205 |
+
|
206 |
+
# Combined getter/setter for the boolean-style settings.
#
#   $json->property                  hash ref of every name in @Properties
#   $json->property($name)           current value, via the get_$name accessor
#   $json->property($name, $value)   calls $json->$name($value)
#
# A max_size of 1 is reported as 0. More than two arguments after the
# invocant is an error.
sub property {
    my ($self, $name, $value) = @_;
    my $argc = @_;

    # No name given: collect every known property.
    if ($argc == 1) {
        my %props;
        for my $prop (@Properties) {
            my $getter  = 'get_' . $prop;
            my $current = $self->$getter();
            $current = 0 if $prop eq 'max_size' and $current == 1;
            $props{$prop} = $current;
        }
        return \%props;
    }

    Carp::croak('property() can take only the option within 2 arguments.')
        if $argc > 3;

    # Setter form. A call with no arguments at all also ends up here, exactly
    # as in the if/elsif chain this replaces.
    return $self->$name($value) if $argc != 2;

    # Getter form: only answer when a get_$name accessor exists.
    if ( my $getter = $self->can('get_' . $name) ) {
        return $self->$getter() unless $name eq 'max_size';

        my $current = $self->$getter();
        return $current == 1 ? 0 : $current;
    }
}
|
239 |
+
|
240 |
+
|
241 |
+
|
242 |
+
# INTERNAL
|
243 |
+
|
244 |
+
# Load the XS backend.
#
# $opt is an optional bit mask:
#   $_INSTALL_DONT_DIE  on a load failure return 0 instead of croaking
#   $_INSTALL_ONLY      only load the module; do not make it the active backend
#
# Returns 1 on success. Croaks when the module cannot be loaded (unless
# $_INSTALL_DONT_DIE is set) or when the XS helper code kept after __DATA__
# fails to compile.
sub _load_xs {
    my $opt = shift;

    $JSON::DEBUG and Carp::carp "Load $Module_XS.";

    # if called after install module, overload is disabled.... why?
    JSON::Boolean::_overrride_overload($Module_XS);
    JSON::Boolean::_overrride_overload($Module_PP);

    eval qq|
        use $Module_XS $XS_Version ();
    |;

    if ($@) {
        if (defined $opt and $opt & $_INSTALL_DONT_DIE) {
            $JSON::DEBUG and Carp::carp "Can't load $Module_XS...($@)";
            return 0;
        }
        Carp::croak $@;
    }

    unless (defined $opt and $opt & $_INSTALL_ONLY) {
        _set_module( $JSON::Backend = $Module_XS );

        # The XS helper classes live after __DATA__ and are compiled on demand.
        my $data = join("", <DATA>); # this code is from Jcode 2.xx.
        close(DATA);
        eval $data;

        # Report a broken helper section here rather than letting it surface
        # as an unrelated failure in the init call below.
        Carp::croak $@ if $@;

        JSON::Backend::XS->init;
    }

    return 1;
};
|
275 |
+
|
276 |
+
|
277 |
+
# Load the pure-Perl backend: JSON::PP, or the bundled JSON::backportPP when
# that has been requested or when JSON::PP cannot be loaded.
#
# $opt is an optional bit mask; with $_INSTALL_ONLY set the module is loaded
# but not made the active backend.
#
# Returns 1 on success (matching _load_xs); croaks when no PP implementation
# can be loaded.
sub _load_pp {
    my $opt = shift;
    my $backend = $_USSING_bpPP ? $Module_bp : $Module_PP;

    $JSON::DEBUG and Carp::carp "Load $backend.";

    # if called after install module, overload is disabled.... why?
    JSON::Boolean::_overrride_overload($Module_XS);
    JSON::Boolean::_overrride_overload($backend);

    if ( $_USSING_bpPP ) {
        eval qq| require $backend |;
    }
    else {
        eval qq| use $backend $PP_Version () |;
    }

    if ($@) {
        if ( $backend eq $Module_PP ) {
            # JSON::PP is missing or too old: fall back to the bundled copy.
            $JSON::DEBUG and Carp::carp "Can't load $Module_PP ($@), so try to load $Module_bp";
            $_USSING_bpPP++;
            $backend = $Module_bp;
            JSON::Boolean::_overrride_overload($backend);
            local $^W; # if PP installed but invalid version, backportPP redefines methods.
            eval qq| require $Module_bp |;
        }
        Carp::croak $@ if $@;
    }

    unless (defined $opt and $opt & $_INSTALL_ONLY) {
        _set_module( $JSON::Backend = $Module_PP ); # even if backportPP, set $Backend with 'JSON::PP'
        JSON::Backend::PP->init;
    }

    # Explicit success value, so "_load_xs(...) or _load_pp()" has a defined result.
    return 1;
};
|
311 |
+
|
312 |
+
|
313 |
+
# Wire package JSON to the chosen backend module: copy its true/false values,
# inherit from it, alias is_bool, and install warning stubs for the methods
# the backend does not provide. Does nothing once $JSON::true is defined.
# Returns 1 after wiring.
sub _set_module {
    defined $JSON::true and return;

    my ($module) = @_;

    local $^W;
    no strict qw(refs);

    ($JSON::true, $JSON::false) = (${"$module\::true"}, ${"$module\::false"});

    push @JSON::ISA, $module;

    # For an XS backend older than version 3, declare JSON::PP::Boolean and
    # make the backend's Boolean class inherit from it.
    if ( JSON->is_xs and JSON->backend->VERSION < 3 ) {
        eval 'package JSON::PP::Boolean';
        push @{"$module\::Boolean::ISA"}, qw(JSON::PP::Boolean);
    }

    *{"JSON::is_bool"} = \&{"$module\::is_bool"};

    # Methods of the *other* backend become stubs that warn and return the
    # invocant, so chained calls keep working.
    my @unsupported = $module eq $Module_XS ? @PPOnlyMethods : @XSOnlyMethods;
    for my $method (@unsupported) {
        *{"JSON::$method"} = sub {
            Carp::carp("$method is not supported in $module.");
            return $_[0];
        };
    }

    return 1;
}
|
341 |
+
|
342 |
+
|
343 |
+
|
344 |
+
#
|
345 |
+
# JSON Boolean
|
346 |
+
#
|
347 |
+
|
348 |
+
package JSON::Boolean;
|
349 |
+
|
350 |
+
my %Installed;
|
351 |
+
|
352 |
+
# Would install string/eq overloading on "<module>::Boolean" and replace the
# backend's true/false constants. The routine is switched off: it returns
# immediately, and everything after the first statement is kept but never runs.
sub _overrride_overload {
    return; # this function is currently disabled.

    my $module = $_[0];
    return if $Installed{$module}++;

    my $boolean = $module . '::Boolean';

    eval sprintf(q|
        package %s;
        use overload (
            '""' => sub { ${$_[0]} == 1 ? 'true' : 'false' },
            'eq' => sub {
                my ($obj, $op) = ref ($_[0]) ? ($_[0], $_[1]) : ($_[1], $_[0]);
                if ($op eq 'true' or $op eq 'false') {
                    return "$obj" eq 'true' ? 'true' eq $op : 'false' eq $op;
                }
                else {
                    return $obj ? 1 == $op : 0 == $op;
                }
            },
        );
    |, $boolean);

    Carp::croak $@ if $@;

    my $xs_loaded = exists $INC{'JSON/XS.pm'};
    my $pp_loaded = exists $INC{'JSON/PP.pm'};

    if ( $xs_loaded and $boolean eq 'JSON::XS::Boolean' ) {
        local $^W;
        my $true  = do { bless \(my $dummy = 1), $boolean };
        my $false = do { bless \(my $dummy = 0), $boolean };
        *JSON::XS::true  = sub () { $true };
        *JSON::XS::false = sub () { $false };
    }
    elsif ( $pp_loaded and $boolean eq 'JSON::PP::Boolean' ) {
        local $^W;
        my $true  = do { bless \(my $dummy = 1), $boolean };
        my $false = do { bless \(my $dummy = 0), $boolean };
        *JSON::PP::true  = sub { $true };
        *JSON::PP::false = sub { $false };
    }

    return 1;
}
|
393 |
+
|
394 |
+
|
395 |
+
#
|
396 |
+
# Helper classes for Backend Module (PP)
|
397 |
+
#
|
398 |
+
|
399 |
+
package JSON::Backend::PP;
|
400 |
+
|
401 |
+
# Point JSON::decode_json / JSON::encode_json at the JSON::PP functions and
# give JSON::PP its is_xs (0) / is_pp (1) predicates. Returns 1.
sub init {
    local $^W;
    no strict qw(refs); # this routine may be called after JSON::Backend::XS init was called.

    for my $func (qw(decode_json encode_json)) {
        *{"JSON::$func"} = \&{"JSON::PP::$func"};
    }

    my %answer = (is_xs => 0, is_pp => 1);
    for my $predicate (qw(is_xs is_pp)) {
        my $value = $answer{$predicate};
        *{"JSON::PP::$predicate"} = sub { $value };
    }

    return 1;
}
|
410 |
+
|
411 |
+
#
|
412 |
+
# To save memory, the below lines are read only when XS backend is used.
|
413 |
+
#
|
414 |
+
|
415 |
+
package JSON;
|
416 |
+
|
417 |
+
1;
|
418 |
+
__DATA__
|
419 |
+
|
420 |
+
|
421 |
+
#
|
422 |
+
# Helper classes for Backend Module (XS)
|
423 |
+
#
|
424 |
+
|
425 |
+
package JSON::Backend::XS;
|
426 |
+
|
427 |
+
# Bit mask covering bits 12..15 of the flag word.
# NOTE(review): presumably where indent_length is kept; not used in this part of the file.
use constant INDENT_LENGTH_FLAG => 15 << 12;

# Flag bits for encode-side features that XS lacks, keyed by the upper-cased
# method name (support_by_pp looks them up with uc($method)).
use constant UNSUPPORTED_ENCODE_FLAG => {
    ESCAPE_SLASH  => 0x00000010,
    ALLOW_BIGNUM  => 0x00000020,
    AS_NONBLESSED => 0x00000040,
    EXPANDED      => 0x10000000, # for developers
};

# Flag bits for decode-side features that XS lacks, keyed the same way.
use constant UNSUPPORTED_DECODE_FLAG => {
    LOOSE             => 0x00000001,
    ALLOW_BIGNUM      => 0x00000002,
    ALLOW_BAREKEY     => 0x00000004,
    ALLOW_SINGLEQUOTE => 0x00000008,
    EXPANDED          => 0x20000000, # for developers
};
|
443 |
+
|
444 |
+
|
445 |
+
# Point JSON::decode_json / JSON::encode_json at the JSON::XS functions and
# give JSON::XS its is_xs (1) / is_pp (0) predicates. Returns 1.
sub init {
    local $^W;
    no strict qw(refs);

    for my $func (qw(decode_json encode_json)) {
        *{"JSON::$func"} = \&{"JSON::XS::$func"};
    }

    my %answer = (is_xs => 1, is_pp => 0);
    for my $predicate (qw(is_xs is_pp)) {
        my $value = $answer{$predicate};
        *{"JSON::XS::$predicate"} = sub { $value };
    }

    return 1;
}
|
454 |
+
|
455 |
+
|
456 |
+
# Switch on "-support_by_pp" mode for the XS backend.
#
# JSON::XS::encode / decode / incr_parse are replaced by the wrappers in
# JSON::Backend::XS::Supportable, with the originals kept reachable as
# JSON::XS::_original_<name>. JSON::new is replaced by a constructor that
# returns a JSON::XS object, reset to 0 and reblessed into the Supportable
# class. For every name in @methods that has an entry in the unsupported-flag
# tables, a setter/getter pair is generated.
#
# Returns 1.
sub support_by_pp {
    my ($class, @methods) = @_;

    local $^W;
    no strict qw(refs);

    # Swap in the wrappers, keeping a handle on each original implementation.
    for my $name (qw(encode decode incr_parse)) {
        my $original = \&{"JSON::XS::$name"};
        *{"JSON::XS::$name"}           = \&{"JSON::Backend::XS::Supportable::_$name"};
        *{"JSON::XS::_original_$name"} = $original;
    }

    push @JSON::Backend::XS::Supportable::ISA, 'JSON';

    my $pkg = 'JSON::Backend::XS::Supportable';

    *{"JSON::new"} = sub {
        # Objects start with no PP-only flags set.
        my $proto = JSON::XS->new; $$proto = 0;
        bless $proto, $pkg;
    };

    for my $method (@methods) {
        my $flag = uc($method);

        # A method may have an encode flag, a decode flag, or both; names
        # without any flag get no generated accessor.
        my $type = (UNSUPPORTED_ENCODE_FLAG->{$flag} || 0)
                 | (UNSUPPORTED_DECODE_FLAG->{$flag} || 0);

        next unless($type);

        $pkg->_make_unsupported_method($method => $type);
    }

#    push @{"JSON::XS::Boolean::ISA"}, qw(JSON::PP::Boolean);
#    push @{"JSON::PP::Boolean::ISA"}, qw(JSON::Boolean);

    $JSON::DEBUG and Carp::carp("set -support_by_pp mode.");

    return 1;
}
|
501 |
+
|
502 |
+
|
503 |
+
|
504 |
+
|
505 |
+
#
|
506 |
+
# Helper classes for XS
|
507 |
+
#
|
508 |
+
|
509 |
+
package JSON::Backend::XS::Supportable;
|
510 |
+
|
511 |
+
# Register this helper package in %Carp::Internal so that Carp skips its frames when reporting the caller.
$Carp::Internal{'JSON::Backend::XS::Supportable'} = 1;
|
512 |
+
|
513 |
+
# _make_unsupported_method($pkg, $method, $type)
#
# Installs two subs into package $pkg:
#   $method      - setter: with a true or missing argument it sets the bits
#                  of $type in the scalar the object refers to; with a
#                  defined false argument it clears them. Returns the object.
#   get_$method  - getter: returns 1 if any bit of $type is set, '' otherwise.
sub _make_unsupported_method {
    my ($pkg, $method, $type) = @_;

    local $^W;
    no strict qw(refs);

    *{"${pkg}::${method}"} = sub {
        local $^W;
        # A missing (undefined) argument means "enable".
        my $enable = defined $_[1] ? $_[1] : 1;
        unless ($enable) {
            ${$_[0]} &= ~$type;
        }
        else {
            ${$_[0]} |= $type;
        }
        return $_[0];
    };

    *{"${pkg}::get_${method}"} = sub {
        return (${$_[0]} & $type) ? 1 : '';
    };

}
|
535 |
+
|
536 |
+
|
537 |
+
# _set_for_pp($type, $self, ...)
#
# Builds a JSON::PP object configured like $self and returns it.
#   $type is 'encode' to apply JSON::Backend::XS::UNSUPPORTED_ENCODE_FLAG;
#   any other value applies UNSUPPORTED_DECODE_FLAG.
# Steps: load the PP backend, copy every entry of $self->property onto the
# PP object (false values are passed as 0), switch each flag-table option on
# or off according to the bits stored in the scalar $self refers to, then
# copy the indent length.
sub _set_for_pp {
    JSON::_load_pp( $_INSTALL_ONLY );

    my ($type, $self) = @_;
    my $pp = JSON::PP->new;

    my $settings = $self->property;
    for my $option (keys %$settings) {
        $pp->$option( $settings->{$option} || 0 );
    }

    my $flag_table = $type eq 'encode'
        ? JSON::Backend::XS::UNSUPPORTED_ENCODE_FLAG()
        : JSON::Backend::XS::UNSUPPORTED_DECODE_FLAG();
    my $bits = ${$self} || 0;

    # 'EXPANDED' is skipped: it is marked "for developer's" in the source.
    for my $flag_name (grep { $_ ne 'EXPANDED' } keys %$flag_table) {
        my $setter = lc $flag_name;
        $pp->$setter( ($bits & $flag_table->{$flag_name}) ? 1 : 0 );
    }

    $pp->indent_length( $self->get_indent_length );

    return $pp;
}
|
563 |
+
|
564 |
+
# _encode($self, $data)
#
# If the scalar $self refers to is false (no flag bits and no indent length
# stored), delegates to $self->_original_encode. Otherwise encodes with a
# JSON::PP object built by _set_for_pp.
sub _encode {
    return $_[0]->_original_encode( $_[1] ) unless ${$_[0]};
    return _set_for_pp('encode' => @_)->encode($_[1]);
}
|
572 |
+
|
573 |
+
|
574 |
+
# _decode($self, $json_text)
#
# If the scalar $self refers to is false (no flag bits and no indent length
# stored), delegates to $self->_original_decode. Otherwise decodes with a
# JSON::PP object built by _set_for_pp.
sub _decode {
    return $_[0]->_original_decode( $_[1] ) unless ${$_[0]};
    return _set_for_pp('decode' => @_)->decode($_[1]);
}
|
582 |
+
|
583 |
+
|
584 |
+
# decode_prefix($self, $json_text)
#
# Always decodes with a JSON::PP object built by _set_for_pp; unlike _decode,
# this sub does not check the stored flags first.
sub decode_prefix {
    my $pp = _set_for_pp('decode' => @_);
    return $pp->decode_prefix($_[1]);
}
|
587 |
+
|
588 |
+
|
589 |
+
# _incr_parse($self, $json_fragment)
#
# If the scalar $self refers to is false, delegates to
# $self->_original_incr_parse. Otherwise calls incr_parse on a JSON::PP
# object built by _set_for_pp.
# NOTE(review): _set_for_pp creates a new JSON::PP object on every call, so
# on the PP path incremental state presumably does not carry over between
# calls - confirm against JSON::PP.
sub _incr_parse {
    return $_[0]->_original_incr_parse( $_[1] ) unless ${$_[0]};
    return _set_for_pp('decode' => @_)->incr_parse($_[1]);
}
|
597 |
+
|
598 |
+
|
599 |
+
# get_indent_length($self)
#
# Returns the indent length (0..15) stored in the scalar $self refers to.
# indent_length() accepts only 0..15 and stores the value shifted left by
# 12, so the value lives in bits 12-15. Shifting down and masking isolates
# exactly those bits. The previous "<< 4 >> 16" form relied on 32-bit
# integer truncation to drop higher bits; with 64-bit integers any bit
# above 15 would leak into the result.
sub get_indent_length {
    return (${$_[0]} >> 12) & 0xF;
}
|
602 |
+
|
603 |
+
|
604 |
+
# indent_length($self, $length)
#
# Stores $length (0..15) in the scalar $self refers to, shifted left by 12,
# after clearing the bits of JSON::Backend::XS::INDENT_LENGTH_FLAG, and
# re-points JSON::XS::encode at the Supportable _encode wrapper.
# An undefined or out-of-range $length only triggers a Carp::carp warning
# and leaves the stored value untouched. Returns $self in both cases.
sub indent_length {
    my ($self, $width) = @_;

    if (!defined $width or $width > 15 or $width < 0) {
        Carp::carp("The acceptable range of indent_length() is 0 to 15.");
        return $self;
    }

    local $^W;
    ${$self} &= ~ JSON::Backend::XS::INDENT_LENGTH_FLAG();
    ${$self} |= ($width << 12);
    *JSON::XS::encode = \&JSON::Backend::XS::Supportable::_encode;

    return $self;
}
|
620 |
+
|
621 |
+
|
622 |
+
1;
|
623 |
+
__END__
|
624 |
+
|
625 |
+
=head1 NAME
|
626 |
+
|
627 |
+
JSON - JSON (JavaScript Object Notation) encoder/decoder
|
628 |
+
|
629 |
+
=head1 SYNOPSIS
|
630 |
+
|
631 |
+
use JSON; # imports encode_json, decode_json, to_json and from_json.
|
632 |
+
|
633 |
+
# simple and fast interfaces (expect/generate UTF-8)
|
634 |
+
|
635 |
+
$utf8_encoded_json_text = encode_json $perl_hash_or_arrayref;
|
636 |
+
$perl_hash_or_arrayref = decode_json $utf8_encoded_json_text;
|
637 |
+
|
638 |
+
# OO-interface
|
639 |
+
|
640 |
+
$json = JSON->new->allow_nonref;
|
641 |
+
|
642 |
+
$json_text = $json->encode( $perl_scalar );
|
643 |
+
$perl_scalar = $json->decode( $json_text );
|
644 |
+
|
645 |
+
$pretty_printed = $json->pretty->encode( $perl_scalar ); # pretty-printing
|
646 |
+
|
647 |
+
# If you want to use PP only support features, call with '-support_by_pp'
|
648 |
+
# When an XS-unsupported feature is enabled, the PP (de|en)code is used instead of the XS one.
|
649 |
+
|
650 |
+
use JSON -support_by_pp;
|
651 |
+
|
652 |
+
# option-acceptable interfaces (expect/generate UNICODE by default)
|
653 |
+
|
654 |
+
$json_text = to_json( $perl_scalar, { ascii => 1, pretty => 1 } );
|
655 |
+
$perl_scalar = from_json( $json_text, { utf8 => 1 } );
|
656 |
+
|
657 |
+
# Between (en|de)code_json and (to|from)_json, if you want to write
|
658 |
+
# a code which communicates to an outer world (encoded in UTF-8),
|
659 |
+
# recommend to use (en|de)code_json.
|
660 |
+
|
661 |
+
=head1 VERSION
|
662 |
+
|
663 |
+
2.90
|
664 |
+
|
665 |
+
This version is compatible with JSON::XS B<2.34> and later.
|
666 |
+
(Not yet compatible with JSON::XS B<3.0x>.)
|
667 |
+
|
668 |
+
|
669 |
+
=head1 NOTE
|
670 |
+
|
671 |
+
JSON::PP was earlier included in the C<JSON> distribution, but
|
672 |
+
has since Perl 5.14 been a core module. For this reason,
|
673 |
+
L<JSON::PP> was removed from the JSON distribution and can now
|
674 |
+
be found also in the Perl5 repository at
|
675 |
+
|
676 |
+
=over
|
677 |
+
|
678 |
+
=item * L<http://perl5.git.perl.org/perl.git>
|
679 |
+
|
680 |
+
=back
|
681 |
+
|
682 |
+
(The newest JSON::PP version still exists in CPAN.)
|
683 |
+
|
684 |
+
Instead, the C<JSON> distribution will include JSON::backportPP
|
685 |
+
for backwards compatibility. JSON.pm should thus work as it did
|
686 |
+
before.
|
687 |
+
|
688 |
+
=head1 DESCRIPTION
|
689 |
+
|
690 |
+
*************************** CAUTION **************************************
|
691 |
+
* *
|
692 |
+
* INCOMPATIBLE CHANGE (JSON::XS version 2.90) *
|
693 |
+
* *
|
694 |
+
* JSON.pm had patched JSON::XS::Boolean and JSON::PP::Boolean internally *
|
695 |
+
* on loading time for making these modules inherit JSON::Boolean. *
|
696 |
+
* But since JSON::XS v3.0 it use Types::Serialiser as boolean class. *
|
697 |
+
* Then now JSON.pm breaks boolean class overload features and  *
|
698 |
+
* -support_by_pp if JSON::XS v3.0 or later is installed. *
|
699 |
+
* *
|
700 |
+
* JSON::true and JSON::false returned JSON::Boolean objects. *
|
701 |
+
* For workaround, they return JSON::PP::Boolean objects in this version. *
|
702 |
+
* *
|
703 |
+
* isa_ok(JSON::true, 'JSON::PP::Boolean'); *
|
704 |
+
* *
|
705 |
+
* And it discards a feature: *
|
706 |
+
* *
|
707 |
+
* ok(JSON::true eq 'true'); *
|
708 |
+
* *
|
709 |
+
* In other word, JSON::PP::Boolean overload numeric only. *
|
710 |
+
* *
|
711 |
+
* ok( JSON::true == 1 ); *
|
712 |
+
* *
|
713 |
+
**************************************************************************
|
714 |
+
|
715 |
+
************************** CAUTION ********************************
|
716 |
+
* This is 'JSON module version 2' and there are many differences *
|
717 |
+
* to version 1.xx *
|
718 |
+
* Please check your applications using old version. *
|
719 |
+
* See to 'INCOMPATIBLE CHANGES TO OLD VERSION' *
|
720 |
+
*******************************************************************
|
721 |
+
|
722 |
+
JSON (JavaScript Object Notation) is a simple data format.
|
723 |
+
See to L<http://www.json.org/> and C<RFC4627>(L<http://www.ietf.org/rfc/rfc4627.txt>).
|
724 |
+
|
725 |
+
This module converts Perl data structures to JSON and vice versa using either
|
726 |
+
L<JSON::XS> or L<JSON::PP>.
|
727 |
+
|
728 |
+
JSON::XS is the fastest and most proper JSON module on CPAN which must be
|
729 |
+
compiled and installed in your environment.
|
730 |
+
JSON::PP is a pure-Perl module which is bundled in this distribution and
|
731 |
+
has a strong compatibility to JSON::XS.
|
732 |
+
|
733 |
+
This module tries to use JSON::XS by default and, failing that, uses JSON::PP instead.
|
734 |
+
So its features completely depend on JSON::XS or JSON::PP.
|
735 |
+
|
736 |
+
See to L<BACKEND MODULE DECISION>.
|
737 |
+
|
738 |
+
To distinguish the module name 'JSON' and the format type JSON,
|
739 |
+
the former is quoted by CE<lt>E<gt> (its results vary with your using media),
|
740 |
+
and the latter is left just as it is.
|
741 |
+
|
742 |
+
Module name : C<JSON>
|
743 |
+
|
744 |
+
Format type : JSON
|
745 |
+
|
746 |
+
=head2 FEATURES
|
747 |
+
|
748 |
+
=over
|
749 |
+
|
750 |
+
=item * correct unicode handling
|
751 |
+
|
752 |
+
This module (i.e. backend modules) knows how to handle Unicode, documents
|
753 |
+
how and when it does so, and even documents what "correct" means.
|
754 |
+
|
755 |
+
Even though there are limitations, this feature is available since Perl version 5.6.
|
756 |
+
|
757 |
+
JSON::XS requires Perl 5.8.2 (but works correctly in 5.8.8 or later), so in older versions
|
758 |
+
C<JSON> should call JSON::PP as the backend which can be used since Perl 5.005.
|
759 |
+
|
760 |
+
With Perl 5.8.x JSON::PP works, but from 5.8.0 to 5.8.2, because of a Perl side problem,
|
761 |
+
JSON::PP works slower in the versions. And in 5.005, the Unicode handling is not available.
|
762 |
+
See to L<JSON::PP/UNICODE HANDLING ON PERLS> for more information.
|
763 |
+
|
764 |
+
See also to L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>
|
765 |
+
and L<JSON::XS/ENCODING/CODESET_FLAG_NOTES>.
|
766 |
+
|
767 |
+
|
768 |
+
=item * round-trip integrity
|
769 |
+
|
770 |
+
When you serialise a perl data structure using only data types supported
|
771 |
+
by JSON and Perl, the deserialised data structure is identical on the Perl
|
772 |
+
level. (e.g. the string "2.0" doesn't suddenly become "2" just because
|
773 |
+
it looks like a number). There I<are> minor exceptions to this, read the
|
774 |
+
L</MAPPING> section below to learn about those.
|
775 |
+
|
776 |
+
|
777 |
+
=item * strict checking of JSON correctness
|
778 |
+
|
779 |
+
There is no guessing, no generating of illegal JSON texts by default,
|
780 |
+
and only JSON is accepted as input by default (the latter is a security
|
781 |
+
feature).
|
782 |
+
|
783 |
+
See to L<JSON::XS/FEATURES> and L<JSON::PP/FEATURES>.
|
784 |
+
|
785 |
+
=item * fast
|
786 |
+
|
787 |
+
This module returns a JSON::XS object itself if available.
|
788 |
+
Compared to other JSON modules and other serialisers such as Storable,
|
789 |
+
JSON::XS usually compares favorably in terms of speed, too.
|
790 |
+
|
791 |
+
If not available, C<JSON> returns a JSON::PP object instead of JSON::XS and
|
792 |
+
it is very slow as pure-Perl.
|
793 |
+
|
794 |
+
=item * simple to use
|
795 |
+
|
796 |
+
This module has both a simple functional interface as well as an
|
797 |
+
object-oriented interface.
|
798 |
+
|
799 |
+
=item * reasonably versatile output formats
|
800 |
+
|
801 |
+
You can choose between the most compact guaranteed-single-line format possible
|
802 |
+
(nice for simple line-based protocols), a pure-ASCII format (for when your transport
|
803 |
+
is not 8-bit clean, still supports the whole Unicode range), or a pretty-printed
|
804 |
+
format (for when you want to read that stuff). Or you can combine those features
|
805 |
+
in whatever way you like.
|
806 |
+
|
807 |
+
=back
|
808 |
+
|
809 |
+
=head1 FUNCTIONAL INTERFACE
|
810 |
+
|
811 |
+
Some documents are copied and modified from L<JSON::XS/FUNCTIONAL INTERFACE>.
|
812 |
+
C<to_json> and C<from_json> are additional functions.
|
813 |
+
|
814 |
+
=head2 encode_json
|
815 |
+
|
816 |
+
$json_text = encode_json $perl_scalar
|
817 |
+
|
818 |
+
Converts the given Perl data structure to a UTF-8 encoded, binary string.
|
819 |
+
|
820 |
+
This function call is functionally identical to:
|
821 |
+
|
822 |
+
$json_text = JSON->new->utf8->encode($perl_scalar)
|
823 |
+
|
824 |
+
=head2 decode_json
|
825 |
+
|
826 |
+
$perl_scalar = decode_json $json_text
|
827 |
+
|
828 |
+
The opposite of C<encode_json>: expects an UTF-8 (binary) string and tries
|
829 |
+
to parse that as an UTF-8 encoded JSON text, returning the resulting
|
830 |
+
reference.
|
831 |
+
|
832 |
+
This function call is functionally identical to:
|
833 |
+
|
834 |
+
$perl_scalar = JSON->new->utf8->decode($json_text)
|
835 |
+
|
836 |
+
|
837 |
+
=head2 to_json
|
838 |
+
|
839 |
+
$json_text = to_json($perl_scalar)
|
840 |
+
|
841 |
+
Converts the given Perl data structure to a json string.
|
842 |
+
|
843 |
+
This function call is functionally identical to:
|
844 |
+
|
845 |
+
$json_text = JSON->new->encode($perl_scalar)
|
846 |
+
|
847 |
+
Takes a hash reference as the second argument.
|
848 |
+
|
849 |
+
$json_text = to_json($perl_scalar, $flag_hashref)
|
850 |
+
|
851 |
+
So,
|
852 |
+
|
853 |
+
$json_text = to_json($perl_scalar, {utf8 => 1, pretty => 1})
|
854 |
+
|
855 |
+
equivalent to:
|
856 |
+
|
857 |
+
$json_text = JSON->new->utf8(1)->pretty(1)->encode($perl_scalar)
|
858 |
+
|
859 |
+
If you want to write a modern perl code which communicates to outer world,
|
860 |
+
you should use C<encode_json> (supposed that JSON data are encoded in UTF-8).
|
861 |
+
|
862 |
+
=head2 from_json
|
863 |
+
|
864 |
+
$perl_scalar = from_json($json_text)
|
865 |
+
|
866 |
+
The opposite of C<to_json>: expects a json string and tries
|
867 |
+
to parse it, returning the resulting reference.
|
868 |
+
|
869 |
+
This function call is functionally identical to:
|
870 |
+
|
871 |
+
$perl_scalar = JSON->new->decode($json_text)
|
872 |
+
|
873 |
+
Takes a hash reference as the second argument.
|
874 |
+
|
875 |
+
$perl_scalar = from_json($json_text, $flag_hashref)
|
876 |
+
|
877 |
+
So,
|
878 |
+
|
879 |
+
$perl_scalar = from_json($json_text, {utf8 => 1})
|
880 |
+
|
881 |
+
equivalent to:
|
882 |
+
|
883 |
+
$perl_scalar = JSON->new->utf8(1)->decode($json_text)
|
884 |
+
|
885 |
+
If you want to write a modern perl code which communicates to outer world,
|
886 |
+
you should use C<decode_json> (supposed that JSON data are encoded in UTF-8).
|
887 |
+
|
888 |
+
=head2 JSON::is_bool
|
889 |
+
|
890 |
+
$is_boolean = JSON::is_bool($scalar)
|
891 |
+
|
892 |
+
Returns true if the passed scalar represents either JSON::true or
|
893 |
+
JSON::false, two constants that act like C<1> and C<0> respectively
|
894 |
+
and are also used to represent JSON C<true> and C<false> in Perl strings.
|
895 |
+
|
896 |
+
=head2 JSON::true
|
897 |
+
|
898 |
+
Returns JSON true value which is blessed object.
|
899 |
+
It C<isa> JSON::PP::Boolean object.
|
900 |
+
|
901 |
+
=head2 JSON::false
|
902 |
+
|
903 |
+
Returns JSON false value which is blessed object.
|
904 |
+
It C<isa> JSON::PP::Boolean object.
|
905 |
+
|
906 |
+
=head2 JSON::null
|
907 |
+
|
908 |
+
Returns C<undef>.
|
909 |
+
|
910 |
+
See L<MAPPING>, below, for more information on how JSON values are mapped to
|
911 |
+
Perl.
|
912 |
+
|
913 |
+
=head1 HOW DO I DECODE A DATA FROM OUTER AND ENCODE TO OUTER
|
914 |
+
|
915 |
+
This section supposes that your perl version is 5.8 or later.
|
916 |
+
|
917 |
+
If you know a JSON text from an outer world - a network, a file content, and so on,
|
918 |
+
is encoded in UTF-8, you should use C<decode_json> or C<JSON> module object
|
919 |
+
with C<utf8> enable. And the decoded result will contain UNICODE characters.
|
920 |
+
|
921 |
+
# from network
|
922 |
+
my $json = JSON->new->utf8;
|
923 |
+
my $json_text = CGI->new->param( 'json_data' );
|
924 |
+
my $perl_scalar = $json->decode( $json_text );
|
925 |
+
|
926 |
+
# from file content
|
927 |
+
local $/;
|
928 |
+
open( my $fh, '<', 'json.data' );
|
929 |
+
$json_text = <$fh>;
|
930 |
+
$perl_scalar = decode_json( $json_text );
|
931 |
+
|
932 |
+
If an outer data is not encoded in UTF-8, firstly you should C<decode> it.
|
933 |
+
|
934 |
+
use Encode;
|
935 |
+
local $/;
|
936 |
+
open( my $fh, '<', 'json.data' );
|
937 |
+
my $encoding = 'cp932';
|
938 |
+
my $unicode_json_text = decode( $encoding, <$fh> ); # UNICODE
|
939 |
+
|
940 |
+
# or you can write the below code.
|
941 |
+
#
|
942 |
+
# open( my $fh, "<:encoding($encoding)", 'json.data' );
|
943 |
+
# $unicode_json_text = <$fh>;
|
944 |
+
|
945 |
+
In this case, C<$unicode_json_text> is of course UNICODE string.
|
946 |
+
So you B<cannot> use C<decode_json> nor C<JSON> module object with C<utf8> enable.
|
947 |
+
Instead of them, you use C<JSON> module object with C<utf8> disable or C<from_json>.
|
948 |
+
|
949 |
+
$perl_scalar = $json->utf8(0)->decode( $unicode_json_text );
|
950 |
+
# or
|
951 |
+
$perl_scalar = from_json( $unicode_json_text );
|
952 |
+
|
953 |
+
Or C<encode 'utf8'> and C<decode_json>:
|
954 |
+
|
955 |
+
$perl_scalar = decode_json( encode( 'utf8', $unicode_json_text ) );
|
956 |
+
# this way is not efficient.
|
957 |
+
|
958 |
+
And now, you want to convert your C<$perl_scalar> into JSON data and
|
959 |
+
send it to an outer world - a network or a file content, and so on.
|
960 |
+
|
961 |
+
Your data usually contains UNICODE strings and you want the converted data to be encoded
|
962 |
+
in UTF-8, you should use C<encode_json> or C<JSON> module object with C<utf8> enable.
|
963 |
+
|
964 |
+
print encode_json( $perl_scalar ); # to a network? file? or display?
|
965 |
+
# or
|
966 |
+
print $json->utf8->encode( $perl_scalar );
|
967 |
+
|
968 |
+
If C<$perl_scalar> does not contain UNICODE but C<$encoding>-encoded strings
|
969 |
+
for some reason, then its characters are regarded as B<latin1> for perl
|
970 |
+
(because it does not concern with your $encoding).
|
971 |
+
You B<cannot> use C<encode_json> nor C<JSON> module object with C<utf8> enable.
|
972 |
+
Instead of them, you use C<JSON> module object with C<utf8> disable or C<to_json>.
|
973 |
+
Note that the resulted text is a UNICODE string but no problem to print it.
|
974 |
+
|
975 |
+
# $perl_scalar contains $encoding encoded string values
|
976 |
+
$unicode_json_text = $json->utf8(0)->encode( $perl_scalar );
|
977 |
+
# or
|
978 |
+
$unicode_json_text = to_json( $perl_scalar );
|
979 |
+
# $unicode_json_text consists of characters less than 0x100
|
980 |
+
print $unicode_json_text;
|
981 |
+
|
982 |
+
Or C<decode $encoding> all string values and C<encode_json>:
|
983 |
+
|
984 |
+
$perl_scalar->{ foo } = decode( $encoding, $perl_scalar->{ foo } );
|
985 |
+
# ... do it to each string values, then encode_json
|
986 |
+
$json_text = encode_json( $perl_scalar );
|
987 |
+
|
988 |
+
This method is a proper way but probably not efficient.
|
989 |
+
|
990 |
+
See to L<Encode>, L<perluniintro>.
|
991 |
+
|
992 |
+
|
993 |
+
=head1 COMMON OBJECT-ORIENTED INTERFACE
|
994 |
+
|
995 |
+
=head2 new
|
996 |
+
|
997 |
+
$json = JSON->new
|
998 |
+
|
999 |
+
Returns a new C<JSON> object inherited from either JSON::XS or JSON::PP
|
1000 |
+
that can be used to de/encode JSON strings.
|
1001 |
+
|
1002 |
+
All boolean flags described below are by default I<disabled>.
|
1003 |
+
|
1004 |
+
The mutators for flags all return the JSON object again and thus calls can
|
1005 |
+
be chained:
|
1006 |
+
|
1007 |
+
my $json = JSON->new->utf8->space_after->encode({a => [1,2]})
|
1008 |
+
=> {"a": [1, 2]}
|
1009 |
+
|
1010 |
+
=head2 ascii
|
1011 |
+
|
1012 |
+
$json = $json->ascii([$enable])
|
1013 |
+
|
1014 |
+
$enabled = $json->get_ascii
|
1015 |
+
|
1016 |
+
If $enable is true (or missing), then the encode method will not generate characters outside
|
1017 |
+
the code range 0..127. Any Unicode characters outside that range will be escaped using either
|
1018 |
+
a single \uXXXX or a double \uHHHH\uLLLL escape sequence, as per RFC4627.
|
1019 |
+
|
1020 |
+
If $enable is false, then the encode method will not escape Unicode characters unless
|
1021 |
+
required by the JSON syntax or other flags. This results in a faster and more compact format.
|
1022 |
+
|
1023 |
+
This feature depends on the used Perl version and environment.
|
1024 |
+
|
1025 |
+
See to L<JSON::PP/UNICODE HANDLING ON PERLS> if the backend is PP.
|
1026 |
+
|
1027 |
+
JSON->new->ascii(1)->encode([chr 0x10401])
|
1028 |
+
=> ["\ud801\udc01"]
|
1029 |
+
|
1030 |
+
=head2 latin1
|
1031 |
+
|
1032 |
+
$json = $json->latin1([$enable])
|
1033 |
+
|
1034 |
+
$enabled = $json->get_latin1
|
1035 |
+
|
1036 |
+
If $enable is true (or missing), then the encode method will encode the resulting JSON
|
1037 |
+
text as latin1 (or iso-8859-1), escaping any characters outside the code range 0..255.
|
1038 |
+
|
1039 |
+
If $enable is false, then the encode method will not escape Unicode characters
|
1040 |
+
unless required by the JSON syntax or other flags.
|
1041 |
+
|
1042 |
+
JSON->new->latin1->encode (["\x{89}\x{abc}"])
|
1043 |
+
=> ["\x{89}\\u0abc"] # (perl syntax, U+abc escaped, U+89 not)
|
1044 |
+
|
1045 |
+
=head2 utf8
|
1046 |
+
|
1047 |
+
$json = $json->utf8([$enable])
|
1048 |
+
|
1049 |
+
$enabled = $json->get_utf8
|
1050 |
+
|
1051 |
+
If $enable is true (or missing), then the encode method will encode the JSON result
|
1052 |
+
into UTF-8, as required by many protocols, while the decode method expects to be handed
|
1053 |
+
an UTF-8-encoded string. Please note that UTF-8-encoded strings do not contain any
|
1054 |
+
characters outside the range 0..255, they are thus useful for bytewise/binary I/O.
|
1055 |
+
|
1056 |
+
In future versions, enabling this option might enable autodetection of the UTF-16 and UTF-32
|
1057 |
+
encoding families, as described in RFC4627.
|
1058 |
+
|
1059 |
+
If $enable is false, then the encode method will return the JSON string as a (non-encoded)
|
1060 |
+
Unicode string, while decode expects thus a Unicode string. Any decoding or encoding
|
1061 |
+
(e.g. to UTF-8 or UTF-16) needs to be done yourself, e.g. using the Encode module.
|
1062 |
+
|
1063 |
+
|
1064 |
+
Example, output UTF-16BE-encoded JSON:
|
1065 |
+
|
1066 |
+
use Encode;
|
1067 |
+
$jsontext = encode "UTF-16BE", JSON::XS->new->encode ($object);
|
1068 |
+
|
1069 |
+
Example, decode UTF-32LE-encoded JSON:
|
1070 |
+
|
1071 |
+
use Encode;
|
1072 |
+
$object = JSON::XS->new->decode (decode "UTF-32LE", $jsontext);
|
1073 |
+
|
1074 |
+
See to L<JSON::PP/UNICODE HANDLING ON PERLS> if the backend is PP.
|
1075 |
+
|
1076 |
+
|
1077 |
+
=head2 pretty
|
1078 |
+
|
1079 |
+
$json = $json->pretty([$enable])
|
1080 |
+
|
1081 |
+
This enables (or disables) all of the C<indent>, C<space_before> and
|
1082 |
+
C<space_after> (and in the future possibly more) flags in one call to
|
1083 |
+
generate the most readable (or most compact) form possible.
|
1084 |
+
|
1085 |
+
Equivalent to:
|
1086 |
+
|
1087 |
+
$json->indent->space_before->space_after
|
1088 |
+
|
1089 |
+
The indent space length is three and JSON::XS cannot change the indent
|
1090 |
+
space length.
|
1091 |
+
|
1092 |
+
=head2 indent
|
1093 |
+
|
1094 |
+
$json = $json->indent([$enable])
|
1095 |
+
|
1096 |
+
$enabled = $json->get_indent
|
1097 |
+
|
1098 |
+
If C<$enable> is true (or missing), then the C<encode> method will use a multiline
|
1099 |
+
format as output, putting every array member or object/hash key-value pair
|
1100 |
+
into its own line, indenting them properly.
|
1101 |
+
|
1102 |
+
If C<$enable> is false, no newlines or indenting will be produced, and the
|
1103 |
+
resulting JSON text is guaranteed not to contain any C<newlines>.
|
1104 |
+
|
1105 |
+
This setting has no effect when decoding JSON texts.
|
1106 |
+
|
1107 |
+
The indent space length is three.
|
1108 |
+
With JSON::PP, you can also access C<indent_length> to change indent space length.
|
1109 |
+
|
1110 |
+
|
1111 |
+
=head2 space_before
|
1112 |
+
|
1113 |
+
$json = $json->space_before([$enable])
|
1114 |
+
|
1115 |
+
$enabled = $json->get_space_before
|
1116 |
+
|
1117 |
+
If C<$enable> is true (or missing), then the C<encode> method will add an extra
|
1118 |
+
optional space before the C<:> separating keys from values in JSON objects.
|
1119 |
+
|
1120 |
+
If C<$enable> is false, then the C<encode> method will not add any extra
|
1121 |
+
space at those places.
|
1122 |
+
|
1123 |
+
This setting has no effect when decoding JSON texts.
|
1124 |
+
|
1125 |
+
Example, space_before enabled, space_after and indent disabled:
|
1126 |
+
|
1127 |
+
{"key" :"value"}
|
1128 |
+
|
1129 |
+
|
1130 |
+
=head2 space_after
|
1131 |
+
|
1132 |
+
$json = $json->space_after([$enable])
|
1133 |
+
|
1134 |
+
$enabled = $json->get_space_after
|
1135 |
+
|
1136 |
+
If C<$enable> is true (or missing), then the C<encode> method will add an extra
|
1137 |
+
optional space after the C<:> separating keys from values in JSON objects
|
1138 |
+
and extra whitespace after the C<,> separating key-value pairs and array
|
1139 |
+
members.
|
1140 |
+
|
1141 |
+
If C<$enable> is false, then the C<encode> method will not add any extra
|
1142 |
+
space at those places.
|
1143 |
+
|
1144 |
+
This setting has no effect when decoding JSON texts.
|
1145 |
+
|
1146 |
+
Example, space_before and indent disabled, space_after enabled:
|
1147 |
+
|
1148 |
+
{"key": "value"}
|
1149 |
+
|
1150 |
+
|
1151 |
+
=head2 relaxed
|
1152 |
+
|
1153 |
+
$json = $json->relaxed([$enable])
|
1154 |
+
|
1155 |
+
$enabled = $json->get_relaxed
|
1156 |
+
|
1157 |
+
If C<$enable> is true (or missing), then C<decode> will accept some
|
1158 |
+
extensions to normal JSON syntax (see below). C<encode> will not be
|
1159 |
+
affected in any way. I<Be aware that this option makes you accept invalid
|
1160 |
+
JSON texts as if they were valid!>. I suggest only to use this option to
|
1161 |
+
parse application-specific files written by humans (configuration files,
|
1162 |
+
resource files etc.)
|
1163 |
+
|
1164 |
+
If C<$enable> is false (the default), then C<decode> will only accept
|
1165 |
+
valid JSON texts.
|
1166 |
+
|
1167 |
+
Currently accepted extensions are:
|
1168 |
+
|
1169 |
+
=over 4
|
1170 |
+
|
1171 |
+
=item * list items can have an end-comma
|
1172 |
+
|
1173 |
+
JSON I<separates> array elements and key-value pairs with commas. This
|
1174 |
+
can be annoying if you write JSON texts manually and want to be able to
|
1175 |
+
quickly append elements, so this extension accepts comma at the end of
|
1176 |
+
such items not just between them:
|
1177 |
+
|
1178 |
+
[
|
1179 |
+
1,
|
1180 |
+
2, <- this comma not normally allowed
|
1181 |
+
]
|
1182 |
+
{
|
1183 |
+
"k1": "v1",
|
1184 |
+
"k2": "v2", <- this comma not normally allowed
|
1185 |
+
}
|
1186 |
+
|
1187 |
+
=item * shell-style '#'-comments
|
1188 |
+
|
1189 |
+
Whenever JSON allows whitespace, shell-style comments are additionally
|
1190 |
+
allowed. They are terminated by the first carriage-return or line-feed
|
1191 |
+
character, after which more white-space and comments are allowed.
|
1192 |
+
|
1193 |
+
[
|
1194 |
+
1, # this comment not allowed in JSON
|
1195 |
+
# neither this one...
|
1196 |
+
]
|
1197 |
+
|
1198 |
+
=back
|
1199 |
+
|
1200 |
+
|
1201 |
+
=head2 canonical
|
1202 |
+
|
1203 |
+
$json = $json->canonical([$enable])
|
1204 |
+
|
1205 |
+
$enabled = $json->get_canonical
|
1206 |
+
|
1207 |
+
If C<$enable> is true (or missing), then the C<encode> method will output JSON objects
|
1208 |
+
by sorting their keys. This is adding a comparatively high overhead.
|
1209 |
+
|
1210 |
+
If C<$enable> is false, then the C<encode> method will output key-value
|
1211 |
+
pairs in the order Perl stores them (which will likely change between runs
|
1212 |
+
of the same script).
|
1213 |
+
|
1214 |
+
This option is useful if you want the same data structure to be encoded as
|
1215 |
+
the same JSON text (given the same overall settings). If it is disabled,
|
1216 |
+
the same hash might be encoded differently even if contains the same data,
|
1217 |
+
as key-value pairs have no inherent ordering in Perl.
|
1218 |
+
|
1219 |
+
This setting has no effect when decoding JSON texts.
|
1220 |
+
|
1221 |
+
=head2 allow_nonref
|
1222 |
+
|
1223 |
+
$json = $json->allow_nonref([$enable])
|
1224 |
+
|
1225 |
+
$enabled = $json->get_allow_nonref
|
1226 |
+
|
1227 |
+
If C<$enable> is true (or missing), then the C<encode> method can convert a
|
1228 |
+
non-reference into its corresponding string, number or null JSON value,
|
1229 |
+
which is an extension to RFC4627. Likewise, C<decode> will accept those JSON
|
1230 |
+
values instead of croaking.
|
1231 |
+
|
1232 |
+
If C<$enable> is false, then the C<encode> method will croak if it isn't
|
1233 |
+
passed an arrayref or hashref, as JSON texts must either be an object
|
1234 |
+
or array. Likewise, C<decode> will croak if given something that is not a
|
1235 |
+
JSON object or array.
|
1236 |
+
|
1237 |
+
JSON->new->allow_nonref->encode ("Hello, World!")
|
1238 |
+
=> "Hello, World!"
|
1239 |
+
|
1240 |
+
=head2 allow_unknown
|
1241 |
+
|
1242 |
+
$json = $json->allow_unknown ([$enable])
|
1243 |
+
|
1244 |
+
$enabled = $json->get_allow_unknown
|
1245 |
+
|
1246 |
+
If $enable is true (or missing), then "encode" will *not* throw an
|
1247 |
+
exception when it encounters values it cannot represent in JSON (for
|
1248 |
+
example, filehandles) but instead will encode a JSON "null" value.
|
1249 |
+
Note that blessed objects are not included here and are handled
|
1250 |
+
separately by C<allow_blessed>.
|
1251 |
+
|
1252 |
+
If $enable is false (the default), then "encode" will throw an
|
1253 |
+
exception when it encounters anything it cannot encode as JSON.
|
1254 |
+
|
1255 |
+
This option does not affect "decode" in any way, and it is
|
1256 |
+
recommended to leave it off unless you know your communications
|
1257 |
+
partner.
|
1258 |
+
|
1259 |
+
=head2 allow_blessed
|
1260 |
+
|
1261 |
+
$json = $json->allow_blessed([$enable])
|
1262 |
+
|
1263 |
+
$enabled = $json->get_allow_blessed
|
1264 |
+
|
1265 |
+
If C<$enable> is true (or missing), then the C<encode> method will not
|
1266 |
+
barf when it encounters a blessed reference. Instead, the value of the
|
1267 |
+
B<convert_blessed> option will decide whether C<null> (C<convert_blessed>
|
1268 |
+
disabled or no C<TO_JSON> method found) or a representation of the
|
1269 |
+
object (C<convert_blessed> enabled and C<TO_JSON> method found) is being
|
1270 |
+
encoded. Has no effect on C<decode>.
|
1271 |
+
|
1272 |
+
If C<$enable> is false (the default), then C<encode> will throw an
|
1273 |
+
exception when it encounters a blessed object.
|
1274 |
+
|
1275 |
+
|
1276 |
+
=head2 convert_blessed
|
1277 |
+
|
1278 |
+
$json = $json->convert_blessed([$enable])
|
1279 |
+
|
1280 |
+
$enabled = $json->get_convert_blessed
|
1281 |
+
|
1282 |
+
If C<$enable> is true (or missing), then C<encode>, upon encountering a
|
1283 |
+
blessed object, will check for the availability of the C<TO_JSON> method
|
1284 |
+
on the object's class. If found, it will be called in scalar context
|
1285 |
+
and the resulting scalar will be encoded instead of the object. If no
|
1286 |
+
C<TO_JSON> method is found, the value of C<allow_blessed> will decide what
|
1287 |
+
to do.
|
1288 |
+
|
1289 |
+
The C<TO_JSON> method may safely call die if it wants. If C<TO_JSON>
|
1290 |
+
returns other blessed objects, those will be handled in the same
|
1291 |
+
way. C<TO_JSON> must take care of not causing an endless recursion cycle
|
1292 |
+
(== crash) in this case. The name of C<TO_JSON> was chosen because other
|
1293 |
+
methods called by the Perl core (== not by the user of the object) are
|
1294 |
+
usually in upper case letters and to avoid collisions with the C<to_json>
|
1295 |
+
function or method.
|
1296 |
+
|
1297 |
+
This setting does not yet influence C<decode> in any way.
|
1298 |
+
|
1299 |
+
If C<$enable> is false, then the C<allow_blessed> setting will decide what
|
1300 |
+
to do when a blessed object is found.
|
1301 |
+
|
1302 |
+
=over
|
1303 |
+
|
1304 |
+
=item convert_blessed_universally mode
|
1305 |
+
|
1306 |
+
If you use C<JSON> with C<-convert_blessed_universally>, the C<UNIVERSAL::TO_JSON>
|
1307 |
+
subroutine is defined as the below code:
|
1308 |
+
|
1309 |
+
*UNIVERSAL::TO_JSON = sub {
|
1310 |
+
my $b_obj = B::svref_2object( $_[0] );
|
1311 |
+
return $b_obj->isa('B::HV') ? { %{ $_[0] } }
|
1312 |
+
: $b_obj->isa('B::AV') ? [ @{ $_[0] } ]
|
1313 |
+
: undef
|
1314 |
+
;
|
1315 |
+
}
|
1316 |
+
|
1317 |
+
This will cause that C<encode> method converts simple blessed objects into
|
1318 |
+
JSON objects as non-blessed object.
|
1319 |
+
|
1320 |
+
use JSON -convert_blessed_universally;
|
1321 |
+
$json->allow_blessed->convert_blessed->encode( $blessed_object )
|
1322 |
+
|
1323 |
+
This feature is experimental and may be removed in the future.
|
1324 |
+
|
1325 |
+
=back
|
1326 |
+
|
1327 |
+
=head2 filter_json_object
|
1328 |
+
|
1329 |
+
$json = $json->filter_json_object([$coderef])
|
1330 |
+
|
1331 |
+
When C<$coderef> is specified, it will be called from C<decode> each
|
1332 |
+
time it decodes a JSON object. The only argument passed to the coderef
|
1333 |
+
is a reference to the newly-created hash. If the code reference returns
|
1334 |
+
a single scalar (which need not be a reference), this value
|
1335 |
+
(i.e. a copy of that scalar to avoid aliasing) is inserted into the
|
1336 |
+
deserialised data structure. If it returns an empty list
|
1337 |
+
(NOTE: I<not> C<undef>, which is a valid scalar), the original deserialised
|
1338 |
+
hash will be inserted. This setting can slow down decoding considerably.
|
1339 |
+
|
1340 |
+
When C<$coderef> is omitted or undefined, any existing callback will
|
1341 |
+
be removed and C<decode> will not change the deserialised hash in any
|
1342 |
+
way.
|
1343 |
+
|
1344 |
+
Example, convert all JSON objects into the integer 5:
|
1345 |
+
|
1346 |
+
my $js = JSON->new->filter_json_object (sub { 5 });
|
1347 |
+
# returns [5]
|
1348 |
+
$js->decode ('[{}]'); # the given subroutine takes a hash reference.
|
1349 |
+
# throw an exception because allow_nonref is not enabled
|
1350 |
+
# so a lone 5 is not allowed.
|
1351 |
+
$js->decode ('{"a":1, "b":2}');
|
1352 |
+
|
1353 |
+
|
1354 |
+
=head2 filter_json_single_key_object
|
1355 |
+
|
1356 |
+
$json = $json->filter_json_single_key_object($key [=> $coderef])
|
1357 |
+
|
1358 |
+
Works remotely similar to C<filter_json_object>, but is only called for
|
1359 |
+
JSON objects having a single key named C<$key>.
|
1360 |
+
|
1361 |
+
This C<$coderef> is called before the one specified via
|
1362 |
+
C<filter_json_object>, if any. It gets passed the single value in the JSON
|
1363 |
+
object. If it returns a single value, it will be inserted into the data
|
1364 |
+
structure. If it returns nothing (not even C<undef> but the empty list),
|
1365 |
+
the callback from C<filter_json_object> will be called next, as if no
|
1366 |
+
single-key callback were specified.
|
1367 |
+
|
1368 |
+
If C<$coderef> is omitted or undefined, the corresponding callback will be
|
1369 |
+
disabled. There can only ever be one callback for a given key.
|
1370 |
+
|
1371 |
+
As this callback gets called less often than the C<filter_json_object>
|
1372 |
+
one, decoding speed will not usually suffer as much. Therefore, single-key
|
1373 |
+
objects make excellent targets to serialise Perl objects into, especially
|
1374 |
+
as single-key JSON objects are as close to the type-tagged value concept
|
1375 |
+
as JSON gets (it's basically an ID/VALUE tuple). Of course, JSON does not
|
1376 |
+
support this in any way, so you need to make sure your data never looks
|
1377 |
+
like a serialised Perl hash.
|
1378 |
+
|
1379 |
+
Typical names for the single object key are C<__class_whatever__>, or
|
1380 |
+
C<$__dollars_are_rarely_used__$> or C<}ugly_brace_placement>, or even
|
1381 |
+
things like C<__class_md5sum(classname)__>, to reduce the risk of clashing
|
1382 |
+
with real hashes.
|
1383 |
+
|
1384 |
+
Example, decode JSON objects of the form C<< { "__widget__" => <id> } >>
|
1385 |
+
into the corresponding C<< $WIDGET{<id>} >> object:
|
1386 |
+
|
1387 |
+
# return whatever is in $WIDGET{5}:
|
1388 |
+
JSON
|
1389 |
+
->new
|
1390 |
+
->filter_json_single_key_object (__widget__ => sub {
|
1391 |
+
$WIDGET{ $_[0] }
|
1392 |
+
})
|
1393 |
+
->decode ('{"__widget__": 5}')
|
1394 |
+
|
1395 |
+
# this can be used with a TO_JSON method in some "widget" class
|
1396 |
+
# for serialisation to json:
|
1397 |
+
sub WidgetBase::TO_JSON {
|
1398 |
+
my ($self) = @_;
|
1399 |
+
|
1400 |
+
unless ($self->{id}) {
|
1401 |
+
$self->{id} = ..get..some..id..;
|
1402 |
+
$WIDGET{$self->{id}} = $self;
|
1403 |
+
}
|
1404 |
+
|
1405 |
+
{ __widget__ => $self->{id} }
|
1406 |
+
}
|
1407 |
+
|
1408 |
+
|
1409 |
+
=head2 shrink
|
1410 |
+
|
1411 |
+
$json = $json->shrink([$enable])
|
1412 |
+
|
1413 |
+
$enabled = $json->get_shrink
|
1414 |
+
|
1415 |
+
With JSON::XS, this flag resizes strings generated by either
|
1416 |
+
C<encode> or C<decode> to their minimum size possible. This can save
|
1417 |
+
memory when your JSON texts are either very very long or you have many
|
1418 |
+
short strings. It will also try to downgrade any strings to octet-form
|
1419 |
+
if possible: perl stores strings internally either in an encoding called
|
1420 |
+
UTF-X or in octet-form. The latter cannot store everything but uses less
|
1421 |
+
space in general (and some buggy Perl or C code might even rely on that
|
1422 |
+
internal representation being used).
|
1423 |
+
|
1424 |
+
With JSON::PP, it is noop about resizing strings but tries
|
1425 |
+
C<utf8::downgrade> to the returned string by C<encode>. See to L<utf8>.
|
1426 |
+
|
1427 |
+
See to L<JSON::XS/OBJECT-ORIENTED INTERFACE> and L<JSON::PP/METHODS>.
|
1428 |
+
|
1429 |
+
=head2 max_depth
|
1430 |
+
|
1431 |
+
$json = $json->max_depth([$maximum_nesting_depth])
|
1432 |
+
|
1433 |
+
$max_depth = $json->get_max_depth
|
1434 |
+
|
1435 |
+
Sets the maximum nesting level (default C<512>) accepted while encoding
|
1436 |
+
or decoding. If a higher nesting level is detected in JSON text or a Perl
|
1437 |
+
data structure, then the encoder and decoder will stop and croak at that
|
1438 |
+
point.
|
1439 |
+
|
1440 |
+
Nesting level is defined by number of hash- or arrayrefs that the encoder
|
1441 |
+
needs to traverse to reach a given point or the number of C<{> or C<[>
|
1442 |
+
characters without their matching closing parenthesis crossed to reach a
|
1443 |
+
given character in a string.
|
1444 |
+
|
1445 |
+
If no argument is given, the highest possible setting will be used, which
|
1446 |
+
is rarely useful.
|
1447 |
+
|
1448 |
+
Note that nesting is implemented by recursion in C. The default value has
|
1449 |
+
been chosen to be as large as typical operating systems allow without
|
1450 |
+
crashing. (JSON::XS)
|
1451 |
+
|
1452 |
+
With JSON::PP as the backend, when a large value (100 or more) was set and
|
1453 |
+
it de/encodes a deep nested object/text, it may raise a warning
|
1454 |
+
'Deep recursion on subroutine' at the perl runtime phase.
|
1455 |
+
|
1456 |
+
See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
|
1457 |
+
|
1458 |
+
=head2 max_size
|
1459 |
+
|
1460 |
+
$json = $json->max_size([$maximum_string_size])
|
1461 |
+
|
1462 |
+
$max_size = $json->get_max_size
|
1463 |
+
|
1464 |
+
Set the maximum length a JSON text may have (in bytes) where decoding is
|
1465 |
+
being attempted. The default is C<0>, meaning no limit. When C<decode>
|
1466 |
+
is called on a string that is longer than this many bytes, it will not
|
1467 |
+
attempt to decode the string but throw an exception. This setting has no
|
1468 |
+
effect on C<encode> (yet).
|
1469 |
+
|
1470 |
+
If no argument is given, the limit check will be deactivated (same as when
|
1471 |
+
C<0> is specified).
|
1472 |
+
|
1473 |
+
See L<JSON::XS/SECURITY CONSIDERATIONS>, below, for more info on why this is useful.
|
1474 |
+
|
1475 |
+
=head2 encode
|
1476 |
+
|
1477 |
+
$json_text = $json->encode($perl_scalar)
|
1478 |
+
|
1479 |
+
Converts the given Perl data structure (a simple scalar or a reference
|
1480 |
+
to a hash or array) to its JSON representation. Simple scalars will be
|
1481 |
+
converted into JSON string or number sequences, while references to arrays
|
1482 |
+
become JSON arrays and references to hashes become JSON objects. Undefined
|
1483 |
+
Perl values (e.g. C<undef>) become JSON C<null> values.
|
1484 |
+
References to the integers C<0> and C<1> are converted into C<false> and C<true>.
|
1485 |
+
|
1486 |
+
=head2 decode
|
1487 |
+
|
1488 |
+
$perl_scalar = $json->decode($json_text)
|
1489 |
+
|
1490 |
+
The opposite of C<encode>: expects a JSON text and tries to parse it,
|
1491 |
+
returning the resulting simple scalar or reference. Croaks on error.
|
1492 |
+
|
1493 |
+
JSON numbers and strings become simple Perl scalars. JSON arrays become
|
1494 |
+
Perl arrayrefs and JSON objects become Perl hashrefs. C<true> becomes
|
1495 |
+
C<1> (C<JSON::true>), C<false> becomes C<0> (C<JSON::false>) and
|
1496 |
+
C<null> becomes C<undef>.
|
1497 |
+
|
1498 |
+
=head2 decode_prefix
|
1499 |
+
|
1500 |
+
($perl_scalar, $characters) = $json->decode_prefix($json_text)
|
1501 |
+
|
1502 |
+
This works like the C<decode> method, but instead of raising an exception
|
1503 |
+
when there is trailing garbage after the first JSON object, it will
|
1504 |
+
silently stop parsing there and return the number of characters consumed
|
1505 |
+
so far.
|
1506 |
+
|
1507 |
+
JSON->new->decode_prefix ("[1] the tail")
|
1508 |
+
=> ([1], 3)
|
1509 |
+
|
1510 |
+
See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>
|
1511 |
+
|
1512 |
+
=head2 property
|
1513 |
+
|
1514 |
+
$boolean = $json->property($property_name)
|
1515 |
+
|
1516 |
+
Returns a boolean value for the given property.
|
1517 |
+
|
1518 |
+
The available properties are C<ascii>, C<latin1>, C<utf8>,
|
1519 |
+
C<indent>, C<space_before>, C<space_after>, C<relaxed>, C<canonical>,
|
1520 |
+
C<allow_nonref>, C<allow_unknown>, C<allow_blessed>, C<convert_blessed>,
|
1521 |
+
C<shrink>, C<max_depth> and C<max_size>.
|
1522 |
+
|
1523 |
+
$boolean = $json->property('utf8');
|
1524 |
+
=> 0
|
1525 |
+
$json->utf8;
|
1526 |
+
$boolean = $json->property('utf8');
|
1527 |
+
=> 1
|
1528 |
+
|
1529 |
+
Sets the property with a given boolean value.
|
1530 |
+
|
1531 |
+
$json = $json->property($property_name => $boolean);
|
1532 |
+
|
1533 |
+
With no argument, it returns all the above properties as a hash reference.
|
1534 |
+
|
1535 |
+
$flag_hashref = $json->property();
|
1536 |
+
|
1537 |
+
=head1 INCREMENTAL PARSING
|
1538 |
+
|
1539 |
+
Most of this section is copied and modified from L<JSON::XS/INCREMENTAL PARSING>.
|
1540 |
+
|
1541 |
+
In some cases, there is the need for incremental parsing of JSON texts.
|
1542 |
+
This module does allow you to parse a JSON stream incrementally.
|
1543 |
+
It does so by accumulating text until it has a full JSON object, which
|
1544 |
+
it then can decode. This process is similar to using C<decode_prefix>
|
1545 |
+
to see if a full JSON object is available, but is much more efficient
|
1546 |
+
(and can be implemented with a minimum of method calls).
|
1547 |
+
|
1548 |
+
The backend module will only attempt to parse the JSON text once it is sure it
|
1549 |
+
has enough text to get a decisive result, using a very simple but
|
1550 |
+
truly incremental parser. This means that it sometimes won't stop as
|
1551 |
+
early as the full parser, for example, it doesn't detect parenthesis
|
1552 |
+
mismatches. The only thing it guarantees is that it starts decoding as
|
1553 |
+
soon as a syntactically valid JSON text has been seen. This means you need
|
1554 |
+
to set resource limits (e.g. C<max_size>) to ensure the parser will stop
|
1555 |
+
parsing in the presence of syntax errors.
|
1556 |
+
|
1557 |
+
The following methods implement this incremental parser.
|
1558 |
+
|
1559 |
+
=head2 incr_parse
|
1560 |
+
|
1561 |
+
$json->incr_parse( [$string] ) # void context
|
1562 |
+
|
1563 |
+
$obj_or_undef = $json->incr_parse( [$string] ) # scalar context
|
1564 |
+
|
1565 |
+
@obj_or_empty = $json->incr_parse( [$string] ) # list context
|
1566 |
+
|
1567 |
+
This is the central parsing function. It can both append new text and
|
1568 |
+
extract objects from the stream accumulated so far (both of these
|
1569 |
+
functions are optional).
|
1570 |
+
|
1571 |
+
If C<$string> is given, then this string is appended to the already
|
1572 |
+
existing JSON fragment stored in the C<$json> object.
|
1573 |
+
|
1574 |
+
After that, if the function is called in void context, it will simply
|
1575 |
+
return without doing anything further. This can be used to add more text
|
1576 |
+
in as many chunks as you want.
|
1577 |
+
|
1578 |
+
If the method is called in scalar context, then it will try to extract
|
1579 |
+
exactly I<one> JSON object. If that is successful, it will return this
|
1580 |
+
object, otherwise it will return C<undef>. If there is a parse error,
|
1581 |
+
this method will croak just as C<decode> would do (one can then use
|
1582 |
+
C<incr_skip> to skip the erroneous part). This is the most common way of
|
1583 |
+
using the method.
|
1584 |
+
|
1585 |
+
And finally, in list context, it will try to extract as many objects
|
1586 |
+
from the stream as it can find and return them, or the empty list
|
1587 |
+
otherwise. For this to work, there must be no separators between the JSON
|
1588 |
+
objects or arrays, instead they must be concatenated back-to-back. If
|
1589 |
+
an error occurs, an exception will be raised as in the scalar context
|
1590 |
+
case. Note that in this case, any previously-parsed JSON texts will be
|
1591 |
+
lost.
|
1592 |
+
|
1593 |
+
Example: Parse some JSON arrays/objects in a given string and return them.
|
1594 |
+
|
1595 |
+
my @objs = JSON->new->incr_parse ("[5][7][1,2]");
|
1596 |
+
|
1597 |
+
=head2 incr_text
|
1598 |
+
|
1599 |
+
$lvalue_string = $json->incr_text
|
1600 |
+
|
1601 |
+
This method returns the currently stored JSON fragment as an lvalue, that
|
1602 |
+
is, you can manipulate it. This I<only> works when a preceding call to
|
1603 |
+
C<incr_parse> in I<scalar context> successfully returned an object. Under
|
1604 |
+
all other circumstances you must not call this function (I mean it.
|
1605 |
+
although in simple tests it might actually work, it I<will> fail under
|
1606 |
+
real world conditions). As a special exception, you can also call this
|
1607 |
+
method before having parsed anything.
|
1608 |
+
|
1609 |
+
This function is useful in two cases: a) finding the trailing text after a
|
1610 |
+
JSON object or b) parsing multiple JSON objects separated by non-JSON text
|
1611 |
+
(such as commas).
|
1612 |
+
|
1613 |
+
$json->incr_text =~ s/\s*,\s*//;
|
1614 |
+
|
1615 |
+
In Perl 5.005, C<lvalue> attribute is not available.
|
1616 |
+
You must write code like the below:
|
1617 |
+
|
1618 |
+
$string = $json->incr_text;
|
1619 |
+
$string =~ s/\s*,\s*//;
|
1620 |
+
$json->incr_text( $string );
|
1621 |
+
|
1622 |
+
=head2 incr_skip
|
1623 |
+
|
1624 |
+
$json->incr_skip
|
1625 |
+
|
1626 |
+
This will reset the state of the incremental parser and will remove the
|
1627 |
+
parsed text from the input buffer. This is useful after C<incr_parse>
|
1628 |
+
died, in which case the input buffer and incremental parser state is left
|
1629 |
+
unchanged, to skip the text parsed so far and to reset the parse state.
|
1630 |
+
|
1631 |
+
=head2 incr_reset
|
1632 |
+
|
1633 |
+
$json->incr_reset
|
1634 |
+
|
1635 |
+
This completely resets the incremental parser, that is, after this call,
|
1636 |
+
it will be as if the parser had never parsed anything.
|
1637 |
+
|
1638 |
+
This is useful if you want to repeatedly parse JSON objects and want to
|
1639 |
+
ignore any trailing data, which means you have to reset the parser after
|
1640 |
+
each successful decode.
|
1641 |
+
|
1642 |
+
See to L<JSON::XS/INCREMENTAL PARSING> for examples.
|
1643 |
+
|
1644 |
+
|
1645 |
+
=head1 JSON::PP SUPPORT METHODS
|
1646 |
+
|
1647 |
+
The below methods are JSON::PP own methods, so when C<JSON> works
|
1648 |
+
with JSON::PP (i.e. the created object is a JSON::PP object), available.
|
1649 |
+
See to L<JSON::PP/JSON::PP OWN METHODS> in detail.
|
1650 |
+
|
1651 |
+
If you use C<JSON> with additional C<-support_by_pp>, some methods
|
1652 |
+
are available even with JSON::XS. See to L<USE PP FEATURES EVEN THOUGH XS BACKEND>.
|
1653 |
+
|
1654 |
+
BEGIN { $ENV{PERL_JSON_BACKEND} = 'JSON::XS' }
|
1655 |
+
|
1656 |
+
use JSON -support_by_pp;
|
1657 |
+
|
1658 |
+
my $json = JSON->new;
|
1659 |
+
$json->allow_nonref->escape_slash->encode("/");
|
1660 |
+
|
1661 |
+
# functional interfaces too.
|
1662 |
+
print to_json(["/"], {escape_slash => 1});
|
1663 |
+
print from_json('["foo"]', {utf8 => 1});
|
1664 |
+
|
1665 |
+
If you do not want to export any functions but still want C<-support_by_pp>,
|
1666 |
+
use C<-no_export>.
|
1667 |
+
|
1668 |
+
use JSON -support_by_pp, -no_export;
|
1669 |
+
# functional interfaces are not exported.
|
1670 |
+
|
1671 |
+
=head2 allow_singlequote
|
1672 |
+
|
1673 |
+
$json = $json->allow_singlequote([$enable])
|
1674 |
+
|
1675 |
+
If C<$enable> is true (or missing), then C<decode> will accept
|
1676 |
+
any JSON strings quoted by single quotations that are invalid JSON
|
1677 |
+
format.
|
1678 |
+
|
1679 |
+
$json->allow_singlequote->decode(q|{"foo":'bar'}|);
|
1680 |
+
$json->allow_singlequote->decode(q|{'foo':"bar"}|);
|
1681 |
+
$json->allow_singlequote->decode(q|{'foo':'bar'}|);
|
1682 |
+
|
1683 |
+
As same as the C<relaxed> option, this option may be used to parse
|
1684 |
+
application-specific files written by humans.
|
1685 |
+
|
1686 |
+
=head2 allow_barekey
|
1687 |
+
|
1688 |
+
$json = $json->allow_barekey([$enable])
|
1689 |
+
|
1690 |
+
If C<$enable> is true (or missing), then C<decode> will accept
|
1691 |
+
bare keys of JSON object that are invalid JSON format.
|
1692 |
+
|
1693 |
+
As same as the C<relaxed> option, this option may be used to parse
|
1694 |
+
application-specific files written by humans.
|
1695 |
+
|
1696 |
+
$json->allow_barekey->decode('{foo:"bar"}');
|
1697 |
+
|
1698 |
+
=head2 allow_bignum
|
1699 |
+
|
1700 |
+
$json = $json->allow_bignum([$enable])
|
1701 |
+
|
1702 |
+
If C<$enable> is true (or missing), then C<decode> will convert
|
1703 |
+
the big integer Perl cannot handle as integer into a L<Math::BigInt>
|
1704 |
+
object and convert a floating number (any) into a L<Math::BigFloat>.
|
1705 |
+
|
1706 |
+
On the contrary, C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
|
1707 |
+
objects into JSON numbers with C<allow_blessed> enabled.
|
1708 |
+
|
1709 |
+
$json->allow_nonref->allow_blessed->allow_bignum;
|
1710 |
+
$bigfloat = $json->decode('2.000000000000000000000000001');
|
1711 |
+
print $json->encode($bigfloat);
|
1712 |
+
# => 2.000000000000000000000000001
|
1713 |
+
|
1714 |
+
See to L<MAPPING> about the conversion of JSON number.
|
1715 |
+
|
1716 |
+
=head2 loose
|
1717 |
+
|
1718 |
+
$json = $json->loose([$enable])
|
1719 |
+
|
1720 |
+
The unescaped [\x00-\x1f\x22\x2f\x5c] strings are invalid in JSON strings
|
1721 |
+
and the module doesn't allow to C<decode> to these (except for \x2f).
|
1722 |
+
If C<$enable> is true (or missing), then C<decode> will accept these
|
1723 |
+
unescaped strings.
|
1724 |
+
|
1725 |
+
$json->loose->decode(qq|["abc
|
1726 |
+
def"]|);
|
1727 |
+
|
1728 |
+
See to L<JSON::PP/JSON::PP OWN METHODS>.
|
1729 |
+
|
1730 |
+
=head2 escape_slash
|
1731 |
+
|
1732 |
+
$json = $json->escape_slash([$enable])
|
1733 |
+
|
1734 |
+
According to JSON Grammar, I<slash> (U+002F) is escaped. But by default
|
1735 |
+
JSON backend modules encode strings without escaping slash.
|
1736 |
+
|
1737 |
+
If C<$enable> is true (or missing), then C<encode> will escape slashes.
|
1738 |
+
|
1739 |
+
=head2 indent_length
|
1740 |
+
|
1741 |
+
$json = $json->indent_length($length)
|
1742 |
+
|
1743 |
+
With JSON::XS, The indent space length is 3 and cannot be changed.
|
1744 |
+
With JSON::PP, it sets the indent space length with the given $length.
|
1745 |
+
The default is 3. The acceptable range is 0 to 15.
|
1746 |
+
|
1747 |
+
=head2 sort_by
|
1748 |
+
|
1749 |
+
$json = $json->sort_by($function_name)
|
1750 |
+
$json = $json->sort_by($subroutine_ref)
|
1751 |
+
|
1752 |
+
If $function_name or $subroutine_ref is set, that sort routine is used.
|
1753 |
+
|
1754 |
+
$js = $pc->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b })->encode($obj);
|
1755 |
+
# is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
|
1756 |
+
|
1757 |
+
$js = $pc->sort_by('own_sort')->encode($obj);
|
1758 |
+
# is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
|
1759 |
+
|
1760 |
+
sub JSON::PP::own_sort { $JSON::PP::a cmp $JSON::PP::b }
|
1761 |
+
|
1762 |
+
As the sorting routine runs in the JSON::PP scope, the given
|
1763 |
+
subroutine name and the special variables C<$a>, C<$b> will begin
|
1764 |
+
with 'JSON::PP::'.
|
1765 |
+
|
1766 |
+
If $integer is set, then the effect is same as C<canonical> on.
|
1767 |
+
|
1768 |
+
See to L<JSON::PP/JSON::PP OWN METHODS>.
|
1769 |
+
|
1770 |
+
=head1 MAPPING
|
1771 |
+
|
1772 |
+
This section is copied from JSON::XS and modified to C<JSON>.
|
1773 |
+
JSON::XS and JSON::PP mapping mechanisms are almost equivalent.
|
1774 |
+
|
1775 |
+
See to L<JSON::XS/MAPPING>.
|
1776 |
+
|
1777 |
+
=head2 JSON -> PERL
|
1778 |
+
|
1779 |
+
=over 4
|
1780 |
+
|
1781 |
+
=item object
|
1782 |
+
|
1783 |
+
A JSON object becomes a reference to a hash in Perl. No ordering of object
|
1784 |
+
keys is preserved (JSON does not preserve object key ordering itself).
|
1785 |
+
|
1786 |
+
=item array
|
1787 |
+
|
1788 |
+
A JSON array becomes a reference to an array in Perl.
|
1789 |
+
|
1790 |
+
=item string
|
1791 |
+
|
1792 |
+
A JSON string becomes a string scalar in Perl - Unicode codepoints in JSON
|
1793 |
+
are represented by the same codepoints in the Perl string, so no manual
|
1794 |
+
decoding is necessary.
|
1795 |
+
|
1796 |
+
=item number
|
1797 |
+
|
1798 |
+
A JSON number becomes either an integer, numeric (floating point) or
|
1799 |
+
string scalar in perl, depending on its range and any fractional parts. On
|
1800 |
+
the Perl level, there is no difference between those as Perl handles all
|
1801 |
+
the conversion details, but an integer may take slightly less memory and
|
1802 |
+
might represent more values exactly than floating point numbers.
|
1803 |
+
|
1804 |
+
If the number consists of digits only, C<JSON> will try to represent
|
1805 |
+
it as an integer value. If that fails, it will try to represent it as
|
1806 |
+
a numeric (floating point) value if that is possible without loss of
|
1807 |
+
precision. Otherwise it will preserve the number as a string value (in
|
1808 |
+
which case you lose roundtripping ability, as the JSON number will be
|
1809 |
+
re-encoded to a JSON string).
|
1810 |
+
|
1811 |
+
Numbers containing a fractional or exponential part will always be
|
1812 |
+
represented as numeric (floating point) values, possibly at a loss of
|
1813 |
+
precision (in which case you might lose perfect roundtripping ability, but
|
1814 |
+
the JSON number will still be re-encoded as a JSON number).
|
1815 |
+
|
1816 |
+
Note that precision is not accuracy - binary floating point values cannot
|
1817 |
+
represent most decimal fractions exactly, and when converting from and to
|
1818 |
+
floating point, C<JSON> only guarantees precision up to but not including
|
1819 |
+
the least significant bit.
|
1820 |
+
|
1821 |
+
If the backend is JSON::PP and C<allow_bignum> is enabled, the big integers
|
1822 |
+
and the numeric can be optionally converted into L<Math::BigInt> and
|
1823 |
+
L<Math::BigFloat> objects.
|
1824 |
+
|
1825 |
+
=item true, false
|
1826 |
+
|
1827 |
+
These JSON atoms become C<JSON::true> and C<JSON::false>,
|
1828 |
+
respectively. They are overloaded to act almost exactly like the numbers
|
1829 |
+
C<1> and C<0>. You can check whether a scalar is a JSON boolean by using
|
1830 |
+
the C<JSON::is_bool> function.
|
1831 |
+
|
1832 |
+
print JSON::true + 1;
|
1833 |
+
=> 2
|
1834 |
+
|
1835 |
+
ok(JSON::true eq '1');
|
1836 |
+
ok(JSON::true == 1);
|
1837 |
+
|
1838 |
+
C<JSON> will install these missing overloading features to the backend modules.
|
1839 |
+
|
1840 |
+
|
1841 |
+
=item null
|
1842 |
+
|
1843 |
+
A JSON null atom becomes C<undef> in Perl.
|
1844 |
+
|
1845 |
+
C<JSON::null> returns C<undef>.
|
1846 |
+
|
1847 |
+
=back
|
1848 |
+
|
1849 |
+
|
1850 |
+
=head2 PERL -> JSON
|
1851 |
+
|
1852 |
+
The mapping from Perl to JSON is slightly more difficult, as Perl is a
|
1853 |
+
truly typeless language, so we can only guess which JSON type is meant by
|
1854 |
+
a Perl value.
|
1855 |
+
|
1856 |
+
=over 4
|
1857 |
+
|
1858 |
+
=item hash references
|
1859 |
+
|
1860 |
+
Perl hash references become JSON objects. As there is no inherent ordering
|
1861 |
+
in hash keys (or JSON objects), they will usually be encoded in a
|
1862 |
+
pseudo-random order that can change between runs of the same program but
|
1863 |
+
stays generally the same within a single run of a program. C<JSON>
|
1864 |
+
can optionally sort the hash keys (determined by the I<canonical> flag), so
|
1865 |
+
the same data structure will serialise to the same JSON text (given same
|
1866 |
+
settings and version of JSON::XS), but this incurs a runtime overhead
|
1867 |
+
and is only rarely useful, e.g. when you want to compare some JSON text
|
1868 |
+
against another for equality.
|
1869 |
+
|
1870 |
+
In future, the ordered object feature will be added to JSON::PP using C<tie> mechanism.
|
1871 |
+
|
1872 |
+
|
1873 |
+
=item array references
|
1874 |
+
|
1875 |
+
Perl array references become JSON arrays.
|
1876 |
+
|
1877 |
+
=item other references
|
1878 |
+
|
1879 |
+
Other unblessed references are generally not allowed and will cause an
|
1880 |
+
exception to be thrown, except for references to the integers C<0> and
|
1881 |
+
C<1>, which get turned into C<false> and C<true> atoms in JSON. You can
|
1882 |
+
also use C<JSON::false> and C<JSON::true> to improve readability.
|
1883 |
+
|
1884 |
+
to_json [\0,JSON::true] # yields [false,true]
|
1885 |
+
|
1886 |
+
=item JSON::true, JSON::false, JSON::null
|
1887 |
+
|
1888 |
+
These special values become JSON true and JSON false values,
|
1889 |
+
respectively. You can also use C<\1> and C<\0> directly if you want.
|
1890 |
+
|
1891 |
+
JSON::null returns C<undef>.
|
1892 |
+
|
1893 |
+
=item blessed objects
|
1894 |
+
|
1895 |
+
Blessed objects are not directly representable in JSON. See the
|
1896 |
+
C<allow_blessed> and C<convert_blessed> methods on various options on
|
1897 |
+
how to deal with this: basically, you can choose between throwing an
|
1898 |
+
exception, encoding the reference as if it weren't blessed, or provide
|
1899 |
+
your own serialiser method.
|
1900 |
+
|
1901 |
+
With C<convert_blessed_universally> mode, C<encode> converts blessed
|
1902 |
+
hash references or blessed array references (contains other blessed references)
|
1903 |
+
into JSON members and arrays.
|
1904 |
+
|
1905 |
+
use JSON -convert_blessed_universally;
|
1906 |
+
JSON->new->allow_blessed->convert_blessed->encode( $blessed_object );
|
1907 |
+
|
1908 |
+
See to L<convert_blessed>.
|
1909 |
+
|
1910 |
+
=item simple scalars
|
1911 |
+
|
1912 |
+
Simple Perl scalars (any scalar that is not a reference) are the most
|
1913 |
+
difficult objects to encode: JSON::XS and JSON::PP will encode undefined scalars as
|
1914 |
+
JSON C<null> values, scalars that have last been used in a string context
|
1915 |
+
before encoding as JSON strings, and anything else as number value:
|
1916 |
+
|
1917 |
+
# dump as number
|
1918 |
+
encode_json [2] # yields [2]
|
1919 |
+
encode_json [-3.0e17] # yields [-3e+17]
|
1920 |
+
my $value = 5; encode_json [$value] # yields [5]
|
1921 |
+
|
1922 |
+
# used as string, so dump as string
|
1923 |
+
print $value;
|
1924 |
+
encode_json [$value] # yields ["5"]
|
1925 |
+
|
1926 |
+
# undef becomes null
|
1927 |
+
encode_json [undef] # yields [null]
|
1928 |
+
|
1929 |
+
You can force the type to be a string by stringifying it:
|
1930 |
+
|
1931 |
+
my $x = 3.1; # some variable containing a number
|
1932 |
+
"$x"; # stringified
|
1933 |
+
$x .= ""; # another, more awkward way to stringify
|
1934 |
+
print $x; # perl does it for you, too, quite often
|
1935 |
+
|
1936 |
+
You can force the type to be a number by numifying it:
|
1937 |
+
|
1938 |
+
my $x = "3"; # some variable containing a string
|
1939 |
+
$x += 0; # numify it, ensuring it will be dumped as a number
|
1940 |
+
$x *= 1; # same thing, the choice is yours.
|
1941 |
+
|
1942 |
+
You can not currently force the type in other, less obscure, ways.
|
1943 |
+
|
1944 |
+
Note that numerical precision has the same meaning as under Perl (so
|
1945 |
+
binary to decimal conversion follows the same rules as in Perl, which
|
1946 |
+
can differ to other languages). Also, your perl interpreter might expose
|
1947 |
+
extensions to the floating point numbers of your platform, such as
|
1948 |
+
infinities or NaN's - these cannot be represented in JSON, and it is an
|
1949 |
+
error to pass those in.
|
1950 |
+
|
1951 |
+
=item Big Number
|
1952 |
+
|
1953 |
+
If the backend is JSON::PP and C<allow_bignum> is enabled,
|
1954 |
+
C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
|
1955 |
+
objects into JSON numbers.
|
1956 |
+
|
1957 |
+
|
1958 |
+
=back
|
1959 |
+
|
1960 |
+
=head1 JSON and ECMAscript
|
1961 |
+
|
1962 |
+
See to L<JSON::XS/JSON and ECMAscript>.
|
1963 |
+
|
1964 |
+
=head1 JSON and YAML
|
1965 |
+
|
1966 |
+
JSON is not a subset of YAML.
|
1967 |
+
See to L<JSON::XS/JSON and YAML>.
|
1968 |
+
|
1969 |
+
|
1970 |
+
=head1 BACKEND MODULE DECISION
|
1971 |
+
|
1972 |
+
When you use C<JSON>, C<JSON> tries to C<use> JSON::XS. If this call fails, it will
|
1973 |
+
C<use> JSON::PP. The required JSON::XS version is I<2.2> or later.
|
1974 |
+
|
1975 |
+
The C<JSON> constructor method returns an object inherited from the backend module,
|
1976 |
+
and JSON::XS object is a blessed scalar reference while JSON::PP is a blessed hash
|
1977 |
+
reference.
|
1978 |
+
|
1979 |
+
So, your program should not depend on the backend module, especially
|
1980 |
+
returned objects should not be modified.
|
1981 |
+
|
1982 |
+
my $json = JSON->new; # XS or PP?
|
1983 |
+
$json->{stash} = 'this is xs object'; # this code may raise an error!
|
1984 |
+
|
1985 |
+
To check the backend module, there are some methods - C<backend>, C<is_pp> and C<is_xs>.
|
1986 |
+
|
1987 |
+
JSON->backend; # 'JSON::XS' or 'JSON::PP'
|
1988 |
+
|
1989 |
+
JSON->backend->is_pp; # 0 or 1
|
1990 |
+
|
1991 |
+
JSON->backend->is_xs; # 1 or 0
|
1992 |
+
|
1993 |
+
$json->is_xs; # 1 or 0
|
1994 |
+
|
1995 |
+
$json->is_pp; # 0 or 1
|
1996 |
+
|
1997 |
+
|
1998 |
+
If you set an environment variable C<PERL_JSON_BACKEND>, the calling action will be changed.
|
1999 |
+
|
2000 |
+
=over
|
2001 |
+
|
2002 |
+
=item PERL_JSON_BACKEND = 0 or PERL_JSON_BACKEND = 'JSON::PP'
|
2003 |
+
|
2004 |
+
Always use JSON::PP
|
2005 |
+
|
2006 |
+
=item PERL_JSON_BACKEND == 1 or PERL_JSON_BACKEND = 'JSON::XS,JSON::PP'
|
2007 |
+
|
2008 |
+
(The default) Use compiled JSON::XS if it is properly compiled & installed,
|
2009 |
+
otherwise use JSON::PP.
|
2010 |
+
|
2011 |
+
=item PERL_JSON_BACKEND == 2 or PERL_JSON_BACKEND = 'JSON::XS'
|
2012 |
+
|
2013 |
+
Always use compiled JSON::XS, die if it isn't properly compiled & installed.
|
2014 |
+
|
2015 |
+
=item PERL_JSON_BACKEND = 'JSON::backportPP'
|
2016 |
+
|
2017 |
+
Always use JSON::backportPP.
|
2018 |
+
JSON::backportPP is JSON::PP back port module.
|
2019 |
+
C<JSON> includes JSON::backportPP instead of JSON::PP.
|
2020 |
+
|
2021 |
+
=back
|
2022 |
+
|
2023 |
+
These ideas come from L<DBI::PurePerl> mechanism.
|
2024 |
+
|
2025 |
+
example:
|
2026 |
+
|
2027 |
+
BEGIN { $ENV{PERL_JSON_BACKEND} = 'JSON::PP' }
|
2028 |
+
use JSON; # always uses JSON::PP
|
2029 |
+
|
2030 |
+
In future, it may be able to specify another module.
|
2031 |
+
|
2032 |
+
=head1 USE PP FEATURES EVEN THOUGH XS BACKEND
|
2033 |
+
|
2034 |
+
Many methods are available with either JSON::XS or JSON::PP and
|
2035 |
+
when the backend module is JSON::XS, if any JSON::PP specific (i.e. JSON::XS unsupported)
|
2036 |
+
method is called, it will C<warn> and be noop.
|
2037 |
+
|
2038 |
+
But if you C<use> C<JSON> passing the optional string C<-support_by_pp>,
|
2039 |
+
it makes a part of those unsupported methods available.
|
2040 |
+
This feature is achieved by using JSON::PP in C<de/encode>.
|
2041 |
+
|
2042 |
+
BEGIN { $ENV{PERL_JSON_BACKEND} = 2 } # with JSON::XS
|
2043 |
+
use JSON -support_by_pp;
|
2044 |
+
my $json = JSON->new;
|
2045 |
+
$json->allow_nonref->escape_slash->encode("/");
|
2046 |
+
|
2047 |
+
At this time, the returned object is a C<JSON::Backend::XS::Supportable>
|
2048 |
+
object (re-blessed XS object), and by checking JSON::XS unsupported flags
|
2049 |
+
in de/encoding, can support some unsupported methods - C<loose>, C<allow_bignum>,
|
2050 |
+
C<allow_barekey>, C<allow_singlequote>, C<escape_slash> and C<indent_length>.
|
2051 |
+
|
2052 |
+
When no unsupported methods are enabled, C<XS de/encode> will be
|
2053 |
+
used as is. The switch is achieved by changing the symbolic tables.
|
2054 |
+
|
2055 |
+
C<-support_by_pp> is effective only when the backend module is JSON::XS
|
2056 |
+
and it slows the de/encoding down a bit.
|
2057 |
+
|
2058 |
+
See to L<JSON::PP SUPPORT METHODS>.
|
2059 |
+
|
2060 |
+
=head1 INCOMPATIBLE CHANGES TO OLD VERSION
|
2061 |
+
|
2062 |
+
There are big incompatibilities between the new version (2.00) and the old one (1.xx).
|
2063 |
+
If you use old C<JSON> 1.xx in your code, please check it.
|
2064 |
+
|
2065 |
+
See to L<Transition ways from 1.xx to 2.xx.>
|
2066 |
+
|
2067 |
+
=over
|
2068 |
+
|
2069 |
+
=item jsonToObj and objToJson are obsoleted.
|
2070 |
+
|
2071 |
+
Non Perl-style name C<jsonToObj> and C<objToJson> are obsoleted
|
2072 |
+
(but not yet deleted from the source).
|
2073 |
+
If you use these functions in your code, please replace them
|
2074 |
+
with C<from_json> and C<to_json>.
|
2075 |
+
|
2076 |
+
|
2077 |
+
=item Global variables are no longer available.
|
2078 |
+
|
2079 |
+
C<JSON> class variables - C<$JSON::AUTOCONVERT>, C<$JSON::BareKey>, etc...
|
2080 |
+
- are not available any longer.
|
2081 |
+
Instead, various features can be used through object methods.
|
2082 |
+
|
2083 |
+
|
2084 |
+
=item Package JSON::Converter and JSON::Parser are deleted.
|
2085 |
+
|
2086 |
+
Now C<JSON> bundles with JSON::PP which can handle JSON more properly than them.
|
2087 |
+
|
2088 |
+
=item Package JSON::NotString is deleted.
|
2089 |
+
|
2090 |
+
There was C<JSON::NotString> class which represents JSON value C<true>, C<false>, C<null>
|
2091 |
+
and numbers. It was deleted and replaced by C<JSON::Boolean>.
|
2092 |
+
|
2093 |
+
C<JSON::Boolean> represents C<true> and C<false>.
|
2094 |
+
|
2095 |
+
C<JSON::Boolean> does not represent C<null>.
|
2096 |
+
|
2097 |
+
C<JSON::null> returns C<undef>.
|
2098 |
+
|
2099 |
+
C<JSON> makes L<JSON::XS::Boolean> and L<JSON::PP::Boolean> is-a relation
|
2100 |
+
to L<JSON::Boolean>.
|
2101 |
+
|
2102 |
+
=item function JSON::Number is obsoleted.
|
2103 |
+
|
2104 |
+
C<JSON::Number> is now needless because JSON::XS and JSON::PP have
|
2105 |
+
round-trip integrity.
|
2106 |
+
|
2107 |
+
=item JSONRPC modules are deleted.
|
2108 |
+
|
2109 |
+
Perl implementation of JSON-RPC protocol - C<JSONRPC >, C<JSONRPC::Transport::HTTP>
|
2110 |
+
and C<Apache::JSONRPC > are deleted in this distribution.
|
2111 |
+
Instead of them, there is L<JSON::RPC> which supports JSON-RPC protocol version 1.1.
|
2112 |
+
|
2113 |
+
=back
|
2114 |
+
|
2115 |
+
=head2 Transition ways from 1.xx to 2.xx.
|
2116 |
+
|
2117 |
+
You should set C<-support_by_pp> mode first, because
|
2118 |
+
it is always successful for the below codes even with JSON::XS.
|
2119 |
+
|
2120 |
+
use JSON -support_by_pp;
|
2121 |
+
|
2122 |
+
=over
|
2123 |
+
|
2124 |
+
=item Exported jsonToObj (simple)
|
2125 |
+
|
2126 |
+
from_json($json_text);
|
2127 |
+
|
2128 |
+
=item Exported objToJson (simple)
|
2129 |
+
|
2130 |
+
to_json($perl_scalar);
|
2131 |
+
|
2132 |
+
=item Exported jsonToObj (advanced)
|
2133 |
+
|
2134 |
+
$flags = {allow_barekey => 1, allow_singlequote => 1};
|
2135 |
+
from_json($json_text, $flags);
|
2136 |
+
|
2137 |
+
equivalent to:
|
2138 |
+
|
2139 |
+
$JSON::BareKey = 1;
|
2140 |
+
$JSON::QuotApos = 1;
|
2141 |
+
jsonToObj($json_text);
|
2142 |
+
|
2143 |
+
=item Exported objToJson (advanced)
|
2144 |
+
|
2145 |
+
$flags = {allow_blessed => 1, allow_barekey => 1};
|
2146 |
+
to_json($perl_scalar, $flags);
|
2147 |
+
|
2148 |
+
equivalent to:
|
2149 |
+
|
2150 |
+
$JSON::BareKey = 1;
|
2151 |
+
objToJson($perl_scalar);
|
2152 |
+
|
2153 |
+
=item jsonToObj as object method
|
2154 |
+
|
2155 |
+
$json->decode($json_text);
|
2156 |
+
|
2157 |
+
=item objToJson as object method
|
2158 |
+
|
2159 |
+
$json->encode($perl_scalar);
|
2160 |
+
|
2161 |
+
=item new method with parameters
|
2162 |
+
|
2163 |
+
The C<new> method in 2.x no longer takes any parameters.
|
2164 |
+
You can set parameters instead;
|
2165 |
+
|
2166 |
+
$json = JSON->new->pretty;
|
2167 |
+
|
2168 |
+
=item $JSON::Pretty, $JSON::Indent, $JSON::Delimiter
|
2169 |
+
|
2170 |
+
If C<indent> is enabled, that means the C<$JSON::Pretty> flag is set. And
|
2171 |
+
C<$JSON::Delimiter> was substituted by C<space_before> and C<space_after>.
|
2172 |
+
In conclusion:
|
2173 |
+
|
2174 |
+
$json->indent->space_before->space_after;
|
2175 |
+
|
2176 |
+
Equivalent to:
|
2177 |
+
|
2178 |
+
$json->pretty;
|
2179 |
+
|
2180 |
+
To change indent length, use C<indent_length>.
|
2181 |
+
|
2182 |
+
(Only with JSON::PP, if C<-support_by_pp> is not used.)
|
2183 |
+
|
2184 |
+
$json->pretty->indent_length(2)->encode($perl_scalar);
|
2185 |
+
|
2186 |
+
=item $JSON::BareKey
|
2187 |
+
|
2188 |
+
(Only with JSON::PP, if C<-support_by_pp> is not used.)
|
2189 |
+
|
2190 |
+
$json->allow_barekey->decode($json_text)
|
2191 |
+
|
2192 |
+
=item $JSON::ConvBlessed
|
2193 |
+
|
2194 |
+
use C<-convert_blessed_universally>. See to L<convert_blessed>.
|
2195 |
+
|
2196 |
+
=item $JSON::QuotApos
|
2197 |
+
|
2198 |
+
(Only with JSON::PP, if C<-support_by_pp> is not used.)
|
2199 |
+
|
2200 |
+
$json->allow_singlequote->decode($json_text)
|
2201 |
+
|
2202 |
+
=item $JSON::SingleQuote
|
2203 |
+
|
2204 |
+
Disabled. C<JSON> no longer produces such an invalid JSON string.
|
2205 |
+
|
2206 |
+
=item $JSON::KeySort
|
2207 |
+
|
2208 |
+
$json->canonical->encode($perl_scalar)
|
2209 |
+
|
2210 |
+
This is the ascii sort.
|
2211 |
+
|
2212 |
+
If you want to use with your own sort routine, check the C<sort_by> method.
|
2213 |
+
|
2214 |
+
(Only with JSON::PP, even if C<-support_by_pp> is used currently.)
|
2215 |
+
|
2216 |
+
$json->sort_by($sort_routine_ref)->encode($perl_scalar)
|
2217 |
+
|
2218 |
+
$json->sort_by(sub { $JSON::PP::a <=> $JSON::PP::b })->encode($perl_scalar)
|
2219 |
+
|
2220 |
+
Can't access C<$a> and C<$b> but C<$JSON::PP::a> and C<$JSON::PP::b>.
|
2221 |
+
|
2222 |
+
=item $JSON::SkipInvalid
|
2223 |
+
|
2224 |
+
$json->allow_unknown
|
2225 |
+
|
2226 |
+
=item $JSON::AUTOCONVERT
|
2227 |
+
|
2228 |
+
Needless. C<JSON> backend modules have the round-trip integrity.
|
2229 |
+
|
2230 |
+
=item $JSON::UTF8
|
2231 |
+
|
2232 |
+
Needless because C<JSON> (JSON::XS/JSON::PP) sets
|
2233 |
+
the UTF8 flag on properly.
|
2234 |
+
|
2235 |
+
# With UTF8-flagged strings
|
2236 |
+
|
2237 |
+
$json->allow_nonref;
|
2238 |
+
$str = chr(1000); # UTF8-flagged
|
2239 |
+
|
2240 |
+
$json_text = $json->utf8(0)->encode($str);
|
2241 |
+
utf8::is_utf8($json_text);
|
2242 |
+
# true
|
2243 |
+
$json_text = $json->utf8(1)->encode($str);
|
2244 |
+
utf8::is_utf8($json_text);
|
2245 |
+
# false
|
2246 |
+
|
2247 |
+
$str = '"' . chr(1000) . '"'; # UTF8-flagged
|
2248 |
+
|
2249 |
+
$perl_scalar = $json->utf8(0)->decode($str);
|
2250 |
+
utf8::is_utf8($perl_scalar);
|
2251 |
+
# true
|
2252 |
+
$perl_scalar = $json->utf8(1)->decode($str);
|
2253 |
+
# died because of 'Wide character in subroutine'
|
2254 |
+
|
2255 |
+
See to L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>.
|
2256 |
+
|
2257 |
+
=item $JSON::UnMapping
|
2258 |
+
|
2259 |
+
Disable. See to L<MAPPING>.
|
2260 |
+
|
2261 |
+
=item $JSON::SelfConvert
|
2262 |
+
|
2263 |
+
This option was deleted.
|
2264 |
+
Instead of it, if a given blessed object has the C<TO_JSON> method,
|
2265 |
+
C<TO_JSON> will be executed with C<convert_blessed>.
|
2266 |
+
|
2267 |
+
$json->convert_blessed->encode($blessed_hashref_or_arrayref)
|
2268 |
+
# if need, call allow_blessed
|
2269 |
+
|
2270 |
+
Note that it was C<toJson> in old version, but now not C<toJson> but C<TO_JSON>.
|
2271 |
+
|
2272 |
+
=back
|
2273 |
+
|
2274 |
+
=head1 TODO
|
2275 |
+
|
2276 |
+
=over
|
2277 |
+
|
2278 |
+
=item example programs
|
2279 |
+
|
2280 |
+
=back
|
2281 |
+
|
2282 |
+
=head1 THREADS
|
2283 |
+
|
2284 |
+
No test with JSON::PP. If with JSON::XS, See to L<JSON::XS/THREADS>.
|
2285 |
+
|
2286 |
+
|
2287 |
+
=head1 BUGS
|
2288 |
+
|
2289 |
+
Please report bugs relevant to C<JSON> to E<lt>makamaka[at]cpan.orgE<gt>.
|
2290 |
+
|
2291 |
+
|
2292 |
+
=head1 SEE ALSO
|
2293 |
+
|
2294 |
+
Most of the document is copied and modified from JSON::XS doc.
|
2295 |
+
|
2296 |
+
L<JSON::XS>, L<JSON::PP>
|
2297 |
+
|
2298 |
+
C<RFC4627>(L<http://www.ietf.org/rfc/rfc4627.txt>)
|
2299 |
+
|
2300 |
+
=head1 AUTHOR
|
2301 |
+
|
2302 |
+
Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
|
2303 |
+
|
2304 |
+
JSON::XS was written by Marc Lehmann <schmorp[at]schmorp.de>
|
2305 |
+
|
2306 |
+
The release of this new version owes to the courtesy of Marc Lehmann.
|
2307 |
+
|
2308 |
+
|
2309 |
+
=head1 COPYRIGHT AND LICENSE
|
2310 |
+
|
2311 |
+
Copyright 2005-2013 by Makamaka Hannyaharamitu
|
2312 |
+
|
2313 |
+
This library is free software; you can redistribute it and/or modify
|
2314 |
+
it under the same terms as Perl itself.
|
2315 |
+
|
2316 |
+
=cut
|
2317 |
+
|
uroman/lib/JSON/backportPP.pm
ADDED
@@ -0,0 +1,2806 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
package # This is JSON::backportPP
|
2 |
+
JSON::PP;
|
3 |
+
|
4 |
+
# JSON-2.0
|
5 |
+
|
6 |
+
use 5.005;
|
7 |
+
use strict;
|
8 |
+
use base qw(Exporter);
|
9 |
+
use overload ();
|
10 |
+
|
11 |
+
use Carp ();
|
12 |
+
use B ();
|
13 |
+
#use Devel::Peek;
|
14 |
+
|
15 |
+
use vars qw($VERSION);
|
16 |
+
$VERSION = '2.27204';
|
17 |
+
|
18 |
+
@JSON::PP::EXPORT = qw(encode_json decode_json from_json to_json);
|
19 |
+
|
20 |
+
# instead of hash-access, i tried index-access for speed.
|
21 |
+
# but this method is not faster than what i expected. so it will be changed.
|
22 |
+
|
23 |
+
use constant P_ASCII => 0;
|
24 |
+
use constant P_LATIN1 => 1;
|
25 |
+
use constant P_UTF8 => 2;
|
26 |
+
use constant P_INDENT => 3;
|
27 |
+
use constant P_CANONICAL => 4;
|
28 |
+
use constant P_SPACE_BEFORE => 5;
|
29 |
+
use constant P_SPACE_AFTER => 6;
|
30 |
+
use constant P_ALLOW_NONREF => 7;
|
31 |
+
use constant P_SHRINK => 8;
|
32 |
+
use constant P_ALLOW_BLESSED => 9;
|
33 |
+
use constant P_CONVERT_BLESSED => 10;
|
34 |
+
use constant P_RELAXED => 11;
|
35 |
+
|
36 |
+
use constant P_LOOSE => 12;
|
37 |
+
use constant P_ALLOW_BIGNUM => 13;
|
38 |
+
use constant P_ALLOW_BAREKEY => 14;
|
39 |
+
use constant P_ALLOW_SINGLEQUOTE => 15;
|
40 |
+
use constant P_ESCAPE_SLASH => 16;
|
41 |
+
use constant P_AS_NONBLESSED => 17;
|
42 |
+
|
43 |
+
use constant P_ALLOW_UNKNOWN => 18;
|
44 |
+
|
45 |
+
use constant OLD_PERL => $] < 5.008 ? 1 : 0;
|
46 |
+
|
47 |
+
BEGIN {
    # Boolean option names; each gets a generated setter and a get_* reader
    # below, backed by the matching P_<NAME> slot of $self->{PROPS}.
    my @xs_compati_bit_properties = qw(
            latin1 ascii utf8 indent canonical space_before space_after allow_nonref shrink
            allow_blessed convert_blessed relaxed allow_unknown
    );
    my @pp_bit_properties = qw(
            allow_singlequote allow_bignum loose
            allow_barekey escape_slash as_nonblessed
    );

    # Perl version check: on Perls older than 5.8 a compatibility helper is
    # loaded (5.6.x -> Compat5006, older -> Compat5005).
    # Helper module sets @JSON::PP::_properties.
    if ($] < 5.008 ) {
        my $helper = $] >= 5.006 ? 'JSON::backportPP::Compat5006' : 'JSON::backportPP::Compat5005';
        eval qq| require $helper |;
        if ($@) { Carp::croak $@; }
    }

    for my $name (@xs_compati_bit_properties, @pp_bit_properties) {
        my $flag_name = 'P_' . uc($name);

        # Generate "sub <name>" (setter, returns the object so calls can be
        # chained; a missing/undef argument means "enable") and
        # "sub get_<name>" (returns 1 or '').
        eval qq/
            sub $name {
                my \$enable = defined \$_[1] ? \$_[1] : 1;

                if (\$enable) {
                    \$_[0]->{PROPS}->[$flag_name] = 1;
                }
                else {
                    \$_[0]->{PROPS}->[$flag_name] = 0;
                }

                \$_[0];
            }

            sub get_$name {
                \$_[0]->{PROPS}->[$flag_name] ? 1 : '';
            }
        /;

        # A failed string eval would otherwise leave the accessor silently
        # undefined; report it the same way the helper load above does.
        if ($@) { Carp::croak $@; }
    }

}
|
89 |
+
|
90 |
+
|
91 |
+
|
92 |
+
# Functions
|
93 |
+
|
94 |
+
# Lookup tables of option names (name => 1).  They are not referenced in the
# visible part of this file; presumably they whitelist options for the
# functional encode/decode helpers -- TODO confirm against the rest of the file.
my %encode_allow_method = map +( $_ => 1 ), qw(
    utf8 pretty allow_nonref latin1 self_encode escape_slash
    allow_blessed convert_blessed indent indent_length allow_bignum
    as_nonblessed
);
my %decode_allow_method = map +( $_ => 1 ), qw(
    utf8 allow_nonref loose allow_singlequote allow_bignum
    allow_barekey max_size relaxed
);
|
102 |
+
|
103 |
+
|
104 |
+
my $JSON; # cache
|
105 |
+
|
106 |
+
# Functional interface: encode one Perl value with the shared, lazily
# created UTF-8 encoder object cached in $JSON.
sub encode_json ($) { # encode
    $JSON = __PACKAGE__->new->utf8 unless $JSON;
    return $JSON->encode(@_);
}
|
109 |
+
|
110 |
+
|
111 |
+
# Functional interface: decode one JSON text with the shared, lazily
# created UTF-8 codec object cached in $JSON.
sub decode_json { # decode
    $JSON = __PACKAGE__->new->utf8 unless $JSON;
    return $JSON->decode(@_);
}
|
114 |
+
|
115 |
+
# Obsoleted
|
116 |
+
|
117 |
+
# Obsolete entry point: always dies, pointing the caller at encode_json.
sub to_json($) {
    my $message = "JSON::PP::to_json has been renamed to encode_json.";
    Carp::croak($message);
}
|
120 |
+
|
121 |
+
|
122 |
+
# Obsolete entry point: always dies, pointing the caller at decode_json.
sub from_json($) {
    my $message = "JSON::PP::from_json has been renamed to decode_json.";
    Carp::croak($message);
}
|
125 |
+
|
126 |
+
|
127 |
+
# Methods
|
128 |
+
|
129 |
+
# Constructor.  Takes no options; returns a hash-based object blessed into
# the invoking class, pre-populated with the default settings below.
sub new {
    my ($class) = @_;

    my %self = (
        max_depth     => 512,
        max_size      => 0,
        indent        => 0,
        FLAGS         => 0,
        # default handler for values that cannot be encoded
        fallback      => sub { encode_error('Invalid value. JSON can only reference.') },
        indent_length => 3,
    );

    return bless \%self, $class;
}
|
142 |
+
|
143 |
+
|
144 |
+
# Encode a Perl value to JSON text; thin wrapper around PP_encode_json.
sub encode {
    my $self = $_[0];
    return $self->PP_encode_json($_[1]);
}
|
147 |
+
|
148 |
+
|
149 |
+
# Decode a JSON text; delegates to PP_decode_json with the option word 0.
sub decode {
    my $self = $_[0];
    return $self->PP_decode_json($_[1], 0);
}
|
152 |
+
|
153 |
+
|
154 |
+
# Like decode, but delegates to PP_decode_json with the option word 1.
sub decode_prefix {
    my $self = $_[0];
    return $self->PP_decode_json($_[1], 1);
}
|
157 |
+
|
158 |
+
|
159 |
+
# accessor
|
160 |
+
|
161 |
+
|
162 |
+
# pretty printing
|
163 |
+
|
164 |
+
# Switch indent, space_before and space_after on or off together.
# A missing or undefined argument means "enable".  Returns the object.
sub pretty {
    my ($self, $flag) = @_;
    $flag = 1 unless defined $flag;

    unless ($flag) {
        $self->indent(0)->space_before(0)->space_after(0);
        return $self;
    }

    # indent_length(3) for JSON::XS compatibility
    $self->indent(1)->indent_length(3)->space_before(1)->space_after(1);
    return $self;
}
|
177 |
+
|
178 |
+
# etc
|
179 |
+
|
180 |
+
# Set the maximum nesting depth; without an argument the limit becomes
# 0x80000000.  Returns the object for chaining.
sub max_depth {
    my ($self, $limit) = @_;
    $limit = 0x80000000 unless defined $limit;
    $self->{max_depth} = $limit;
    return $self;
}
|
185 |
+
|
186 |
+
|
187 |
+
# Return the configured maximum nesting depth.
sub get_max_depth {
    my ($self) = @_;
    return $self->{max_depth};
}
|
188 |
+
|
189 |
+
|
190 |
+
# Set the max_size setting; without an argument it is reset to 0.
# Returns the object for chaining.
sub max_size {
    my ($self, $limit) = @_;
    $limit = 0 unless defined $limit;
    $self->{max_size} = $limit;
    return $self;
}
|
195 |
+
|
196 |
+
|
197 |
+
# Return the configured max_size setting.
sub get_max_size {
    my ($self) = @_;
    return $self->{max_size};
}
|
198 |
+
|
199 |
+
|
200 |
+
# Install (or, with no/undef argument, clear to 0) the object callback and
# recompute F_HOOK, which is 1 when either kind of filter is present.
# Returns the object for chaining.
sub filter_json_object {
    my ($self, $callback) = @_;

    $self->{cb_object} = defined $callback ? $callback : 0;

    my $hooked = $self->{cb_object} || $self->{cb_sk_object};
    $self->{F_HOOK} = $hooked ? 1 : 0;

    return $self;
}
|
205 |
+
|
206 |
+
# Register or remove the callback for single-key objects with the given key.
#
#   $json->filter_json_single_key_object($key, $coderef);  # install
#   $json->filter_json_single_key_object($key);            # remove
#
# Passing no callback (or undef) removes the filter for that key.  Previously
# undef was stored in the table, so the table never became empty and F_HOOK
# stayed 1 even when no filter was left.  Returns the object for chaining.
sub filter_json_single_key_object {
    if (@_ > 1) {
        if (defined $_[2]) {
            $_[0]->{cb_sk_object}->{$_[1]} = $_[2];
        }
        elsif ($_[0]->{cb_sk_object}) {
            delete $_[0]->{cb_sk_object}->{$_[1]};
            # drop the empty table so it no longer counts as an active hook
            delete $_[0]->{cb_sk_object} unless %{ $_[0]->{cb_sk_object} };
        }
    }
    $_[0]->{F_HOOK} = ($_[0]->{cb_object} or $_[0]->{cb_sk_object}) ? 1 : 0;
    $_[0];
}
|
213 |
+
|
214 |
+
# Set the number of spaces per indent level.  Values outside 0..15 (or an
# undefined value) only produce a warning and leave the setting unchanged.
# Returns the object for chaining.
sub indent_length {
    my ($self, $length) = @_;

    unless (defined $length and not ($length > 15 or $length < 0)) {
        Carp::carp "The acceptable range of indent_length() is 0 to 15.";
        return $self;
    }

    $self->{indent_length} = $length;
    return $self;
}
|
223 |
+
|
224 |
+
# Return the configured number of spaces per indent level.
sub get_indent_length {
    my ($self) = @_;
    return $self->{indent_length};
}
|
227 |
+
|
228 |
+
# Store the key-sorting setting: the given value, or 1 when none is given.
# Returns the object for chaining.
sub sort_by {
    my ($self, $sorter) = @_;
    $self->{sort_by} = defined $sorter ? $sorter : 1;
    return $self;
}
|
232 |
+
|
233 |
+
# Obsolete alias for allow_bignum.  Warns about the rename, then forwards to
# allow_bignum (passing the optional enable flag along) so the option is
# actually applied and the object is returned for method chaining.
sub allow_bigint {
    Carp::carp("allow_bigint() is obsoleted. use allow_bignum() instead.");
    $_[0]->allow_bignum($_[1]);
}
|
236 |
+
|
237 |
+
###############################
|
238 |
+
|
239 |
+
###
|
240 |
+
### Perl => JSON
|
241 |
+
###
|
242 |
+
|
243 |
+
|
244 |
+
{ # Convert
|
245 |
+
|
246 |
+
my $max_depth;
|
247 |
+
my $indent;
|
248 |
+
my $ascii;
|
249 |
+
my $latin1;
|
250 |
+
my $utf8;
|
251 |
+
my $space_before;
|
252 |
+
my $space_after;
|
253 |
+
my $canonical;
|
254 |
+
my $allow_blessed;
|
255 |
+
my $convert_blessed;
|
256 |
+
|
257 |
+
my $indent_length;
|
258 |
+
my $escape_slash;
|
259 |
+
my $bignum;
|
260 |
+
my $as_nonblessed;
|
261 |
+
|
262 |
+
my $depth;
|
263 |
+
my $indent_count;
|
264 |
+
my $keysort;
|
265 |
+
|
266 |
+
|
267 |
+
# Encoder entry point: copies the object's settings into the file-scoped
# lexicals used by the recursive *_to_json helpers, encodes $obj and returns
# the resulting string.  Dies via encode_error when $obj is not a reference
# and allow_nonref is off.
sub PP_encode_json {
    my ($self, $obj) = @_;

    # reset the per-call counters
    ($indent_count, $depth) = (0, 0);

    my $props = $self->{PROPS};

    # P_ASCII .. P_SPACE_AFTER are the consecutive flag slots 0..6
    ($ascii, $latin1, $utf8, $indent, $canonical, $space_before, $space_after)
        = @{$props}[P_ASCII .. P_SPACE_AFTER];
    ($allow_blessed, $convert_blessed, $escape_slash, $bignum, $as_nonblessed)
        = @{$props}[P_ALLOW_BLESSED, P_CONVERT_BLESSED,
                    P_ESCAPE_SLASH, P_ALLOW_BIGNUM, P_AS_NONBLESSED];

    $max_depth     = $self->{max_depth};
    $indent_length = $self->{indent_length};

    # canonical => plain string comparison of keys; sort_by overrides it
    $keysort = undef;
    $keysort = sub { $a cmp $b } if $canonical;

    my $sort_by = $self->{sort_by};
    if ($sort_by) {
        if (ref($sort_by) eq 'CODE' or $sort_by =~ /\D+/) {
            $keysort = $sort_by;               # code ref or non-numeric name
        }
        else {
            $keysort = sub { $a cmp $b };      # plain true value
        }
    }

    unless (ref $obj or $props->[ P_ALLOW_NONREF ]) {
        encode_error("hash- or arrayref expected (not a simple scalar, use allow_nonref to allow this)");
    }

    my $str = $self->object_to_json($obj);

    $str .= "\n" if $indent; # JSON::XS 2.26 compatible

    utf8::upgrade($str) unless $ascii or $latin1 or $utf8;

    utf8::downgrade($str, 1) if $props->[ P_SHRINK ];

    return $str;
}
|
308 |
+
|
309 |
+
|
310 |
+
# Dispatch one value to the matching encoder:
#   HASH / ARRAY refs          -> hash_to_json / array_to_json
#   non-refs, unblessed refs   -> value_to_json
#   blessed objects            -> Boolean, TO_JSON, bignum, as_nonblessed,
#                                 'null' (allow_blessed) or an error
sub object_to_json {
    my ($self, $obj) = @_;
    my $type = ref($obj);

    return $self->hash_to_json($obj)  if $type eq 'HASH';
    return $self->array_to_json($obj) if $type eq 'ARRAY';

    # plain scalars and references that are not blessed objects
    return $self->value_to_json($obj) unless $type and blessed($obj);

    return $self->value_to_json($obj) if $obj->isa('JSON::PP::Boolean');

    if ( $convert_blessed and $obj->can('TO_JSON') ) {
        my $result = $obj->TO_JSON();

        # TO_JSON handing back the very same object would recurse forever
        if ( defined $result and ref( $result )
             and refaddr( $obj ) eq refaddr( $result ) ) {
            encode_error( sprintf(
                "%s::TO_JSON method returned same object as was passed instead of a new one",
                ref $obj
            ) );
        }

        return $self->object_to_json( $result );
    }

    return "$obj" if $bignum and _is_bignum($obj);
    return $self->blessed_to_json($obj) if $allow_blessed and $as_nonblessed; # will be removed.

    unless ($allow_blessed) {
        encode_error( sprintf("encountered object '%s', but neither allow_blessed "
            . "nor convert_blessed settings are enabled", $obj)
        );
    }

    return 'null';
}
|
356 |
+
|
357 |
+
|
358 |
+
# Encode a hash reference as a JSON object.  Keys come from _sort($obj);
# nesting depth is tracked in $depth and checked against $max_depth.
sub hash_to_json {
    my ($self, $obj) = @_;

    if (++$depth > $max_depth) {
        encode_error("json text or perl structure exceeds maximum nesting level (max_depth set too low?)");
    }

    my ($pre, $post) = ('', '');
    ($pre, $post) = $self->_up_indent() if $indent;

    # "key:value" separator with the optional spaces around the colon
    my $separator = join ':', ($space_before ? ' ' : ''), ($space_after ? ' ' : '');

    my @members;
    for my $key ( _sort( $obj ) ) {
        if ( OLD_PERL ) { utf8::decode($key) } # key for Perl 5.6 / be optimized
        my $name  = string_to_json( $self, $key );
        my $value = $self->object_to_json( $obj->{$key} ) || $self->value_to_json( $obj->{$key} );
        push @members, $name . $separator . $value;
    }

    --$depth;
    $self->_down_indent() if $indent;

    return '{}' unless @members;
    return '{' . $pre . join( ",$pre", @members ) . $post . '}';
}
|
380 |
+
|
381 |
+
|
382 |
+
# Encode an array reference as a JSON array.
#
#   $self - the encoder object
#   $obj  - array reference to encode
#
# Returns the JSON text, indented according to the encoder's $indent
# setting.  Croaks (via encode_error) when the nesting depth exceeds
# $max_depth.
sub array_to_json {
    my ($self, $obj) = @_;
    my @res;

    encode_error("json text or perl structure exceeds maximum nesting level (max_depth set too low?)")
        if (++$depth > $max_depth);

    my ($pre, $post) = $indent ? $self->_up_indent() : ('', '');

    for my $v (@$obj){
        # object_to_json() already falls back to value_to_json() on its own.
        # The former "|| value_to_json($v)" retry fired whenever the encoded
        # text was "0" (false in Perl), turning e.g. a bignum zero into a
        # quoted string.  scalar() keeps the scalar context "||" imposed.
        push @res, scalar( $self->object_to_json($v) );
    }

    --$depth;
    $self->_down_indent() if ($indent);

    return '[' . ( @res ? $pre : '' ) . ( @res ? join( ",$pre", @res ) . $post : '' ) . ']';
}
|
400 |
+
|
401 |
+
|
402 |
+
# Encode a scalar (non HASH/ARRAY) value as JSON text.
#
#   $self  - the encoder object
#   $value - the value to encode
#
# Returns:
#   'null'            for undef
#   the value as is   when its internal flags say integer/float and not string
#   a quoted string   for any other non-reference
#   'true' / 'false'  for JSON::PP::Boolean objects and for \1 / \0
#   'null'            for other references when P_ALLOW_UNKNOWN is set
# Croaks (via encode_error) for references that cannot be represented.
#
# The original trailing "else" branch (which invoked $self->{fallback}) could
# never run, because the preceding "!$type" / "$type" tests are exhaustive;
# it has been removed.
sub value_to_json {
    my ($self, $value) = @_;

    return 'null' if (!defined $value);

    my $b_obj = B::svref_2object(\$value);  # for round trip problem
    my $flags = $b_obj->FLAGS;

    return $value # as is
      if $flags & ( B::SVp_IOK | B::SVp_NOK ) and !( $flags & B::SVp_POK ); # SvTYPE is IV or NV?

    my $type = ref($value);

    return string_to_json($self, $value) unless $type;

    if ( blessed($value) and $value->isa('JSON::PP::Boolean') ) {
        return $$value == 1 ? 'true' : 'false';
    }

    # A "Class=TYPE(0x...)" address string means the reference is blessed:
    # encode its stringification instead.
    if ((overload::StrVal($value) =~ /=(\w+)/)[0]) {
        return $self->value_to_json("$value");
    }

    if ($type eq 'SCALAR' and defined $$value) {
        return   $$value eq '1' ? 'true'
               : $$value eq '0' ? 'false'
               : $self->{PROPS}->[ P_ALLOW_UNKNOWN ] ? 'null'
               : encode_error("cannot encode reference to scalar");
    }

    return 'null' if $self->{PROPS}->[ P_ALLOW_UNKNOWN ];

    if ( $type eq 'SCALAR' or $type eq 'REF' ) {
        encode_error("cannot encode reference to scalar");
    }
    else {
        encode_error("encountered $value, but JSON can only represent references to arrays or hashes");
    }
}
|
453 |
+
|
454 |
+
|
455 |
+
# Short escape sequences, keyed by the raw character they replace.
my %esc = (
    "\n" => q{\n},
    "\r" => q{\r},
    "\t" => q{\t},
    "\f" => q{\f},
    "\b" => q{\b},
    q{"} => q{\"},
    "\\" => q{\\\\},
    q{'} => q{\\'},
);


# Turn a Perl string into a double-quoted JSON string literal.
#
#   $self - the encoder object (unused here)
#   $arg  - the string to encode
#
# Quote, backslash and the common control characters get their short escapes,
# '/' is escaped when $escape_slash is set, and the remaining control
# characters below 0x20 become \u00XX.  The result is then passed through the
# ASCII / Latin-1 / UTF-8 output conversions selected on the encoder.
sub string_to_json {
    my ($self, $arg) = @_;

    $arg =~ s{([\x22\x5c\n\r\t\f\b])}{$esc{$1}}g;

    if ($escape_slash) {
        $arg =~ s{/}{\\/}g;
    }

    $arg =~ s{([\x00-\x08\x0b\x0e-\x1f])}{ sprintf('\\u%04x', ord($1)) }eg;

    $arg = JSON_PP_encode_ascii($arg)  if $ascii;
    $arg = JSON_PP_encode_latin1($arg) if $latin1;
    utf8::encode($arg)                 if $utf8;

    return join '', '"', $arg, '"';
}
|
488 |
+
|
489 |
+
|
490 |
+
# Encode a blessed object by looking at its underlying storage type:
# hash-based and array-based objects are dumped like plain hashes/arrays,
# anything else becomes 'null'.
#
#   $_[0] - the encoder object
#   $_[1] - the blessed reference
sub blessed_to_json {
    my ($self, $obj) = @_;
    my $storage = reftype($obj) || '';

    return $self->hash_to_json($obj)  if $storage eq 'HASH';
    return $self->array_to_json($obj) if $storage eq 'ARRAY';
    return 'null';
}
|
502 |
+
|
503 |
+
|
504 |
+
# Abort encoding: croak with the given message (stringified), reported from
# the caller's perspective by Carp.
sub encode_error {
    my ($message) = @_;
    Carp::croak("$message");
}
|
508 |
+
|
509 |
+
|
510 |
+
# Return the keys of the hash referenced by $_[0]; when a key-sort routine
# is configured in $keysort the keys are sorted with it, otherwise they come
# back in Perl's native hash order.
sub _sort {
    my $hash = $_[0];

    return keys %{$hash} unless defined $keysort;
    return sort $keysort (keys %{$hash});
}
|
513 |
+
|
514 |
+
|
515 |
+
# Enter one nesting level for pretty-printing.
#
# Increments $indent_count and returns ($pre, $post):
#   $pre  - newline + indentation for items at the new (deeper) level
#   $post - newline + indentation for the closing bracket at the old level
sub _up_indent {
    my $self = shift;
    my $unit = ' ' x $indent_length;

    my $closing = "\n" . $unit x $indent_count;
    $indent_count++;
    my $opening = "\n" . $unit x $indent_count;

    return ($opening, $closing);
}
|
529 |
+
|
530 |
+
|
531 |
+
# Leave one nesting level for pretty-printing (counterpart of _up_indent).
sub _down_indent {
    return $indent_count--;
}
|
532 |
+
|
533 |
+
|
534 |
+
# Expose a snapshot of the encoder's internal counters as a hash reference
# with the keys 'depth' and 'indent_count'.
sub PP_encode_box {
    return +{
        depth        => $depth,
        indent_count => $indent_count,
    };
}
|
540 |
+
|
541 |
+
} # Convert
|
542 |
+
|
543 |
+
|
544 |
+
# Return a copy of the string in $_[0] in which every character above 127 is
# replaced by a JSON \uXXXX escape; characters above U+FFFF are written as
# two escapes built from _encode_surrogates().
sub _encode_ascii {
    my $out = '';

    for my $cp (unpack('U*', $_[0])) {
        if ($cp <= 127) {
            $out .= chr($cp);
        }
        elsif ($cp <= 65535) {
            $out .= sprintf('\u%04x', $cp);
        }
        else {
            $out .= sprintf('\u%x\u%x', _encode_surrogates($cp));
        }
    }

    return $out;
}
|
554 |
+
|
555 |
+
|
556 |
+
# Return a copy of the string in $_[0] in which every character above 255 is
# replaced by a JSON \uXXXX escape; characters above U+FFFF are written as
# two escapes built from _encode_surrogates().
sub _encode_latin1 {
    my $out = '';

    for my $cp (unpack('U*', $_[0])) {
        if ($cp <= 255) {
            $out .= chr($cp);
        }
        elsif ($cp <= 65535) {
            $out .= sprintf('\u%04x', $cp);
        }
        else {
            $out .= sprintf('\u%x\u%x', _encode_surrogates($cp));
        }
    }

    return $out;
}
|
566 |
+
|
567 |
+
|
568 |
+
# Split a code point above U+FFFF into its UTF-16 surrogate pair.
# (from perlunicode)
#
#   $_[0] - code point in the range 0x10000 .. 0x10FFFF
#
# Returns the list (high surrogate, low surrogate) as integers.  The division
# is truncated explicitly: plain "/" is floating point in Perl, so the high
# surrogate used to come back with a fractional part.
sub _encode_surrogates {
    my $uni = $_[0] - 0x10000;
    return ( int( $uni / 0x400 ) + 0xD800, $uni % 0x400 + 0xDC00 );
}
|
572 |
+
|
573 |
+
|
574 |
+
# True when the object in $_[0] is a Math::BigInt or a Math::BigFloat
# (or a subclass of either).
sub _is_bignum {
    my $candidate = $_[0];
    return $candidate->isa('Math::BigInt') || $candidate->isa('Math::BigFloat');
}
|
577 |
+
|
578 |
+
|
579 |
+
|
580 |
+
#
|
581 |
+
# JSON => Perl
|
582 |
+
#
|
583 |
+
|
584 |
+
# Largest number of decimal digits that still evaluates to an integer (rather
# than switching to exponent notation) on this perl; probed at compile time.
my $max_intsize;

BEGIN {
    my $digits = 1111;

    # Grow a literal of all ones one digit at a time (5 .. 64 digits) and
    # stop at the first length whose numeric value stringifies with an
    # exponent.
    DIGITS: foreach my $count (5 .. 64) {
        $digits .= 1;
        my $as_number = eval qq| $digits |;

        next DIGITS unless $as_number =~ /[eE]/;

        $max_intsize = $count - 1;
        last DIGITS;
    }
}
|
597 |
+
|
598 |
+
{ # PARSE
|
599 |
+
|
600 |
+
my %escapes = ( # by Jeremy Muhlich <jmuhlich [at] bitflood.org>
|
601 |
+
b => "\x8",
|
602 |
+
t => "\x9",
|
603 |
+
n => "\xA",
|
604 |
+
f => "\xC",
|
605 |
+
r => "\xD",
|
606 |
+
'\\' => '\\',
|
607 |
+
'"' => '"',
|
608 |
+
'/' => '/',
|
609 |
+
);
|
610 |
+
|
611 |
+
my $text; # json data
|
612 |
+
my $at; # offset
|
613 |
+
my $ch; # 1chracter
|
614 |
+
my $len; # text length (changed according to UTF8 or NON UTF8)
|
615 |
+
# INTERNAL
|
616 |
+
my $depth; # nest counter
|
617 |
+
my $encoding; # json text encoding
|
618 |
+
my $is_valid_utf8; # temp variable
|
619 |
+
my $utf8_len; # utf8 byte length
|
620 |
+
# FLAGS
|
621 |
+
my $utf8; # must be utf8
|
622 |
+
my $max_depth; # max nest number of objects and arrays
|
623 |
+
my $max_size;
|
624 |
+
my $relaxed;
|
625 |
+
my $cb_object;
|
626 |
+
my $cb_sk_object;
|
627 |
+
|
628 |
+
my $F_HOOK;
|
629 |
+
|
630 |
+
my $allow_bigint; # using Math::BigInt
|
631 |
+
my $singlequote; # loosely quoting
|
632 |
+
my $loose; #
|
633 |
+
my $allow_barekey; # bareKey
|
634 |
+
|
635 |
+
# $opt flag
|
636 |
+
# 0x00000001 .... decode_prefix
|
637 |
+
# 0x10000000 .... incr_parse
|
638 |
+
|
639 |
+
# Decode one JSON text.
#
#   $self - the decoder object (settings come from {PROPS} and the
#           max_depth / max_size / cb_object / cb_sk_object / F_HOOK slots)
#   $text - the JSON text
#   $opt  - bit flags: 0x00000001 decode_prefix, 0x10000000 incr_parse
#
# Returns the decoded value; with decode_prefix returns the list
# ($value, $consumed_length) and tolerates trailing data.  With incr_parse a
# false result yields undef.  Croaks (via decode_error) on malformed input,
# on text larger than max_size, on a non-reference top-level value unless
# P_ALLOW_NONREF is set, and on trailing garbage.
sub PP_decode_json {
    my ($self, $opt); # $opt is an effective flag during this decode_json.

    ($self, $text, $opt) = @_;
    $opt = 0 unless defined $opt;   # callers may omit the flag word

    ($at, $ch, $depth) = (0, '', 0);

    if ( !defined $text or ref $text ) {
        decode_error("malformed JSON string, neither array, object, number, string or atom");
    }

    my $idx = $self->{PROPS};

    ($utf8, $relaxed, $loose, $allow_bigint, $allow_barekey, $singlequote)
        = @{$idx}[P_UTF8, P_RELAXED, P_LOOSE .. P_ALLOW_SINGLEQUOTE];

    if ( $utf8 ) {
        utf8::downgrade( $text, 1 ) or Carp::croak("Wide character in subroutine entry");
    }
    else {
        utf8::upgrade( $text );
    }

    $len = length $text;

    ($max_depth, $max_size, $cb_object, $cb_sk_object, $F_HOOK)
         = @{$self}{qw/max_depth max_size cb_object cb_sk_object F_HOOK/};

    if ($max_size > 1) {
        use bytes;   # measure the text in bytes, not characters
        my $bytes = length $text;
        decode_error(
            sprintf("attempted decode of JSON text of %s bytes size, but max_size is set to %s"
                , $bytes, $max_size), 1
        ) if ($bytes > $max_size);
    }

    # Currently no effect
    # should use regexp
    my @octets = unpack('C4', $text);
    $encoding =   ( $octets[0] and  $octets[1]) ? 'UTF-8'
                : (!$octets[0] and  $octets[1]) ? 'UTF-16BE'
                : (!$octets[0] and !$octets[1]) ? 'UTF-32BE'
                : ( $octets[2]                ) ? 'UTF-16LE'
                : (!$octets[2]                ) ? 'UTF-32LE'
                : 'unknown';

    white(); # remove head white space

    my $valid_start = defined $ch; # Is there a first character for JSON structure?

    my $result = value();

    return undef if ( !$result && ( $opt & 0x10000000 ) ); # for incr_parse

    decode_error("malformed JSON string, neither array, object, number, string or atom") unless $valid_start;

    if ( !$idx->[ P_ALLOW_NONREF ] and !ref $result ) {
        decode_error(
            'JSON text must be an object or array (but found number, string, true, false or null,'
            . ' use allow_nonref to allow this)', 1);
    }

    Carp::croak('something wrong.') if $len < $at; # we won't arrive here.

    my $consumed = defined $ch ? $at - 1 : $at; # consumed JSON text length

    white(); # remove tail white space

    # Test definedness, not truth: a trailing "0" is a false value in Perl
    # and used to slip through as if the text had ended (e.g. '[1]0').
    if ( defined $ch ) {
        return ( $result, $consumed ) if ($opt & 0x00000001); # all right if decode_prefix
        decode_error("garbage after JSON object");
    }

    ( $opt & 0x00000001 ) ? ( $result, $consumed ) : $result;
}
|
715 |
+
|
716 |
+
|
717 |
+
# Advance the parser by one character: store the character at offset $at in
# $ch, bump $at, and return the character.  At the end of the text $ch is set
# to undef and undef is returned.
sub next_chr {
    if ($at < $len) {
        $ch = substr($text, $at, 1);
        $at++;
        return $ch;
    }

    $ch = undef;
    return $ch;
}
|
721 |
+
|
722 |
+
|
723 |
+
# Parse the JSON value starting at the current character, after skipping
# whitespace, by dispatching on that character.  Returns nothing when the
# input is exhausted.
sub value {
    white();

    return unless defined $ch;

    if ($ch eq '{') {
        return object();
    }
    elsif ($ch eq '[') {
        return array();
    }
    elsif ($ch eq '"' or ($singlequote and $ch eq "'")) {
        return string();
    }
    elsif ($ch =~ /[0-9]/ or $ch eq '-') {
        return number();
    }

    return word();
}
|
732 |
+
|
733 |
+
# Parse a JSON string literal starting at the opening quote in $ch.
#
# Accepts "..." and, when $singlequote is set, '...'.  Returns the decoded
# Perl string with $ch positioned after the closing quote.  Croaks (via
# decode_error) on an unterminated string, an unknown backslash escape or a
# raw control character (both unless $loose), broken surrogate pairs, and
# malformed UTF-8 when $utf8 is set.
#
# Non-ASCII content is accumulated in $s as UTF-8 bytes and decoded once at
# the closing quote (see $is_utf8).
sub string {
    my ($i, $s, $t, $u);
    my $utf16;      # pending high surrogate (hex digits) awaiting its low half
    my $is_utf8;    # set once $s holds UTF-8 bytes that need decoding

    ($is_valid_utf8, $utf8_len) = ('', 0);

    $s = ''; # basically UTF8 flag on

    if($ch eq '"' or ($singlequote and $ch eq "'")){
        my $boundChar = $ch;   # the string must close with the quote it opened with

        OUTER: while( defined(next_chr()) ){

            if($ch eq $boundChar){
                next_chr();

                if ($utf16) {
                    decode_error("missing low surrogate character in surrogate pair");
                }

                utf8::decode($s) if($is_utf8);

                return $s;
            }
            elsif($ch eq '\\'){
                next_chr();
                if(exists $escapes{$ch}){
                    $s .= $escapes{$ch};
                }
                elsif($ch eq 'u'){ # UNICODE handling
                    my $u = '';

                    # Collect exactly four hex digits; anything else leaves
                    # the loop and ends in the "unexpected end" error below.
                    for(1..4){
                        $ch = next_chr();
                        last OUTER if($ch !~ /[0-9a-fA-F]/);
                        $u .= $ch;
                    }

                    # U+D800 - U+DBFF
                    if ($u =~ /^[dD][89abAB][0-9a-fA-F]{2}/) { # UTF-16 high surrogate?
                        $utf16 = $u;
                    }
                    # U+DC00 - U+DFFF
                    elsif ($u =~ /^[dD][c-fC-F][0-9a-fA-F]{2}/) { # UTF-16 low surrogate?
                        unless (defined $utf16) {
                            decode_error("missing high surrogate character in surrogate pair");
                        }
                        $is_utf8 = 1;
                        $s .= JSON_PP_decode_surrogates($utf16, $u) || next;
                        $utf16 = undef;
                    }
                    else {
                        if (defined $utf16) {
                            decode_error("surrogate pair expected");
                        }

                        if ( ( my $hex = hex( $u ) ) > 127 ) {
                            $is_utf8 = 1;
                            $s .= JSON_PP_decode_unicode($u) || next;
                        }
                        else {
                            $s .= chr $hex;
                        }
                    }

                }
                else{
                    unless ($loose) {
                        $at -= 2;   # point the error report at the backslash
                        decode_error('illegal backslash escape sequence in string');
                    }
                    $s .= $ch;
                }
            }
            else{

                if ( ord $ch > 127 ) {
                    if ( $utf8 ) {
                        # Input is raw bytes: validate the whole multi-byte
                        # sequence and skip past its continuation bytes.
                        unless( $ch = is_valid_utf8($ch) ) {
                            $at -= 1;
                            decode_error("malformed UTF-8 character in JSON string");
                        }
                        else {
                            $at += $utf8_len - 1;
                        }
                    }
                    else {
                        # Input is characters: store this one as UTF-8 bytes.
                        utf8::encode( $ch );
                    }

                    $is_utf8 = 1;
                }

                if (!$loose) {
                    if ($ch =~ /[\x00-\x1f\x22\x5c]/) { # '/' ok
                        $at--;
                        decode_error('invalid character encountered while parsing JSON string');
                    }
                }

                $s .= $ch;
            }
        }
    }

    decode_error("unexpected end of string while parsing JSON string");
}
|
841 |
+
|
842 |
+
|
843 |
+
# Skip whitespace and comments, leaving $ch on the next significant
# character (or undef at end of input).
#
# Every character that compares "le ' '" counts as whitespace.  '//' line
# comments and '/* ... */' block comments are skipped without consulting
# $relaxed; '#' line comments are skipped only when $relaxed is set.
# Croaks (via decode_error) on an unterminated block comment or on a '/'
# that does not start a comment.
sub white {
    while( defined $ch ){
        if($ch le ' '){
            next_chr();
        }
        elsif($ch eq '/'){
            next_chr();
            if(defined $ch and $ch eq '/'){
                # line comment: consume up to (not including) the line end
                1 while(defined(next_chr()) and $ch ne "\n" and $ch ne "\r");
            }
            elsif(defined $ch and $ch eq '*'){
                next_chr();
                while(1){
                    if(defined $ch){
                        if($ch eq '*'){
                            if(defined(next_chr()) and $ch eq '/'){
                                next_chr();
                                last;
                            }
                        }
                        else{
                            next_chr();
                        }
                    }
                    else{
                        decode_error("Unterminated comment");
                    }
                }
                next;
            }
            else{
                $at--;
                decode_error("malformed JSON string, neither array, object, number, string or atom");
            }
        }
        else{
            if ($relaxed and $ch eq '#') { # correctly?
                # Jump past the rest of the line with one anchored match.
                pos($text) = $at;
                $text =~ /\G([^\n]*(?:\r\n|\r|\n|$))/g;
                $at = pos($text);
                next_chr;
                next;
            }

            last;
        }
    }
}
|
891 |
+
|
892 |
+
|
893 |
+
# Parse a JSON array starting at '[' in $ch.
#
#   $_[0] - optional array reference to fill instead of a fresh one
#
# Returns the array reference with $ch positioned after the closing ']'.
# When $relaxed is set a trailing comma before ']' is accepted.  Croaks (via
# decode_error) when nesting exceeds $max_depth or the array is malformed.
sub array {
    my $list = $_[0] || []; # you can use this code to use another array ref object.

    decode_error('json text or perl structure exceeds maximum nesting level (max_depth set too low?)')
        if (++$depth > $max_depth);

    next_chr();
    white();

    if (defined $ch and $ch eq ']') {
        --$depth;
        next_chr();
        return $list;
    }

    while (defined($ch)) {
        push @$list, value();

        white();

        last if !defined $ch;

        if ($ch eq ']') {
            --$depth;
            next_chr();
            return $list;
        }

        last if $ch ne ',';

        next_chr();
        white();

        # $ch is undef when the text ends right after the comma; check that
        # before comparing to avoid an uninitialized-value comparison.
        if ($relaxed and defined $ch and $ch eq ']') {
            --$depth;
            next_chr();
            return $list;
        }
    }

    decode_error(", or ] expected while parsing array");
}
|
941 |
+
|
942 |
+
|
943 |
+
# Parse a JSON object starting at '{' in $ch.
#
#   $_[0] - optional hash reference to fill instead of a fresh one
#
# Returns the hash reference, or whatever _json_object_hook() returns for it
# when $F_HOOK is set, with $ch positioned after the closing '}'.  Keys may
# be bare words when $allow_barekey is set; a trailing comma before '}' is
# accepted when $relaxed is set.  Croaks (via decode_error) when nesting
# exceeds $max_depth or the object is malformed.
sub object {
    my $o = $_[0] || {}; # you can use this code to use another hash ref object.
    my $k;

    decode_error('json text or perl structure exceeds maximum nesting level (max_depth set too low?)')
        if (++$depth > $max_depth);
    next_chr();
    white();

    if (defined $ch and $ch eq '}') {
        --$depth;
        next_chr();
        return $F_HOOK ? _json_object_hook($o) : $o;
    }

    while (defined $ch) {
        $k = ($allow_barekey and $ch ne '"' and $ch ne "'") ? bareKey() : string();
        white();

        if (!defined $ch or $ch ne ':') {
            $at--;
            decode_error("':' expected");
        }

        next_chr();
        $o->{$k} = value();
        white();

        last if (!defined $ch);

        if ($ch eq '}') {
            --$depth;
            next_chr();
            return $F_HOOK ? _json_object_hook($o) : $o;
        }

        last if $ch ne ',';

        next_chr();
        white();

        # $ch is undef when the text ends right after the comma; check that
        # before comparing to avoid an uninitialized-value comparison.
        if ($relaxed and defined $ch and $ch eq '}') {
            --$depth;
            next_chr();
            return $F_HOOK ? _json_object_hook($o) : $o;
        }
    }

    $at--;
    decode_error(", or } expected while parsing object/hash");
}
|
1008 |
+
|
1009 |
+
|
1010 |
+
# Read an unquoted object key (allow_barekey extension).
# Doesn't strictly follow Standard ECMA-262 3rd Edition.
#
# Consumes characters while they are outside the excluded ASCII ranges in
# the class below (which leaves '$', digits, letters, '_' and everything
# above 0x7F) and returns them.  Returns '' when no such character is
# present; as a hash key that is the same slot the former undef produced.
# The loop now stops cleanly at end of input instead of matching against an
# undefined $ch.
sub bareKey {
    my $key = '';

    while (defined $ch and $ch =~ /[^\x00-\x23\x25-\x2F\x3A-\x40\x5B-\x5E\x60\x7B-\x7F]/) {
        $key .= $ch;
        next_chr();
    }

    return $key;
}
|
1018 |
+
|
1019 |
+
|
1020 |
+
# Parse one of the literals true / false / null at the current position.
#
# Returns $JSON::PP::true, $JSON::PP::false or undef and moves past the
# literal.  Anything else croaks (via decode_error) with a message chosen
# from the first letter seen.
sub word {
    my $literal = substr($text, $at - 1, 4);

    if ($literal eq 'true') {
        $at += 3;
        next_chr;
        return $JSON::PP::true;
    }

    if ($literal eq 'null') {
        $at += 3;
        next_chr;
        return undef;
    }

    if ($literal eq 'fals') {
        $at += 3;
        # only four characters were sliced; confirm the trailing 'e'
        if (substr($text, $at, 1) eq 'e') {
            $at++;
            next_chr;
            return $JSON::PP::false;
        }
    }

    $at--; # for decode_error report

    if ($literal =~ /^n/) {
        decode_error("'null' expected");
    }
    if ($literal =~ /^t/) {
        decode_error("'true' expected");
    }
    if ($literal =~ /^f/) {
        decode_error("'false' expected");
    }
    decode_error("malformed JSON string, neither array, object, number, string or atom");
}
|
1049 |
+
|
1050 |
+
|
1051 |
+
# Parse a JSON number starting at the current character.
#
# Returns a Perl number; integers with more digits than $max_intsize are
# returned as a string, or as a Math::BigInt when $allow_bigint is set.
# NOTE(review): with $allow_bigint set, every other number is returned as a
# Math::BigFloat, including small integers; kept as in the original.
#
# Croaks (via decode_error) on a leading zero followed by another digit or
# by 'x'/'X', and on a sign, decimal point or exponent that is not followed
# by a digit.
#
# Fixes over the original: "08" / "09" were accepted because only octal
# digits were checked after a leading zero, and the leading-zero rule was
# not applied after a minus sign ("-012").  The check now runs after the
# optional sign and covers all ten digits.
sub number {
    my $n = '';

    if ($ch eq '-') {
        $n = '-';
        next_chr;
        if (!defined $ch or $ch !~ /\d/) {
            decode_error("malformed number (no digits after initial minus)");
        }
    }

    # According to RFC4627, hex or oct digits are invalid.
    if ($ch eq '0') {
        my $peek = substr($text, $at, 1);   # character after the zero, '' at end
        if (defined $peek and $peek =~ /^[0-9xX]/) {
            decode_error("malformed number (leading zero must not be followed by another digit)");
        }
    }

    while (defined $ch and $ch =~ /\d/) {
        $n .= $ch;
        next_chr;
    }

    if (defined $ch and $ch eq '.') {
        $n .= '.';

        next_chr;
        if (!defined $ch or $ch !~ /\d/) {
            decode_error("malformed number (no digits after decimal point)");
        }
        else {
            $n .= $ch;
        }

        while (defined(next_chr) and $ch =~ /\d/) {
            $n .= $ch;
        }
    }

    if (defined $ch and ($ch eq 'e' or $ch eq 'E')) {
        $n .= $ch;
        next_chr;

        if (defined($ch) and ($ch eq '+' or $ch eq '-')) {
            $n .= $ch;
            next_chr;
            if (!defined $ch or $ch =~ /\D/) {
                decode_error("malformed number (no digits after exp sign)");
            }
            $n .= $ch;
        }
        elsif (defined($ch) and $ch =~ /\d/) {
            $n .= $ch;
        }
        else {
            decode_error("malformed number (no digits after exp sign)");
        }

        while (defined(next_chr) and $ch =~ /\d/) {
            $n .= $ch;
        }
    }

    my $v = $n;

    if ($v !~ /[.eE]/ and length($v) > $max_intsize) {
        if ($allow_bigint) { # from Adam Sussman
            require Math::BigInt;
            return Math::BigInt->new($v);
        }
        else {
            return "$v";
        }
    }
    elsif ($allow_bigint) {
        require Math::BigFloat;
        return Math::BigFloat->new($v);
    }

    return 0+$v;
}
|
1153 |
+
|
1154 |
+
|
1155 |
+
# Validate the UTF-8 byte sequence that starts with the lead byte in $_[0]
# (the byte just read, at offset $at - 1 of $text).
#
# Sets $utf8_len to the sequence length implied by the lead byte (0 when the
# byte cannot start a sequence).  Returns the complete byte sequence when it
# is well formed, '' when it is not, and nothing when the lead byte itself
# is invalid.
sub is_valid_utf8 {
    my $lead = $_[0];

    if    ($lead =~ /[\x00-\x7F]/) { $utf8_len = 1 }
    elsif ($lead =~ /[\xC2-\xDF]/) { $utf8_len = 2 }
    elsif ($lead =~ /[\xE0-\xEF]/) { $utf8_len = 3 }
    elsif ($lead =~ /[\xF0-\xF4]/) { $utf8_len = 4 }
    else                           { $utf8_len = 0 }

    return unless $utf8_len;

    my $candidate = substr($text, $at - 1, $utf8_len);

    my $well_formed = $candidate =~ /^(?:
         [\x00-\x7F]
        |[\xC2-\xDF][\x80-\xBF]
        |[\xE0][\xA0-\xBF][\x80-\xBF]
        |[\xE1-\xEC][\x80-\xBF][\x80-\xBF]
        |[\xED][\x80-\x9F][\x80-\xBF]
        |[\xEE-\xEF][\x80-\xBF][\x80-\xBF]
        |[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]
        |[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]
        |[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]
    )$/x;

    return $candidate if $well_formed;
    return '';
}
|
1180 |
+
|
1181 |
+
|
1182 |
+
# Abort decoding with a croak.
#
#   $error  - the message
#   $no_rep - when true, croak with the bare message only
#
# Otherwise the message is followed by the current character offset and a
# printable preview of the text from that offset (cut off with '...' once it
# reaches 20 characters), or '(end of string)' when nothing is left.
sub decode_error {
    my ($error, $no_rep) = @_;

    my $rest    = defined $text ? substr($text, $at) : '';
    my $preview = '';

    my $template;
    if    ($] >= 5.008)              { $template = 'U*' }
    elsif ($] < 5.006)               { $template = 'C*' }
    elsif (utf8::is_utf8( $rest ))   { $template = 'U*' } # 5.6
    else                             { $template = 'C*' }

    # Characters shown with a symbolic escape.
    my %symbolic = (
        0x07 => '\a',
        0x09 => '\t',
        0x0a => '\n',
        0x0d => '\r',
        0x0c => '\f',
        0x5c => '\\\\',
    );

    for my $code ( unpack( $template, $rest ) ) { # emulate pv_uni_display() ?
        if (exists $symbolic{$code}) {
            $preview .= $symbolic{$code};
        }
        elsif ($code < 0x20 or $code >= 0x80) {
            $preview .= sprintf('\x{%x}', $code);
        }
        else {
            $preview .= chr($code);
        }

        if ( length($preview) >= 20 ) {
            $preview .= '...';
            last;
        }
    }

    $preview = '(end of string)' unless length $preview;

    if ($no_rep) {
        Carp::croak("$error");
    }
    Carp::croak("$error, at character offset $at (before \"$preview\")");
}
|
1219 |
+
|
1220 |
+
|
1221 |
+
# Apply the user's object filters to a freshly decoded hash.
#
#   $_[0] - the decoded hash reference
#
# A single-key hash whose key has a code reference registered in
# $cb_sk_object is passed (its value) to that callback first; if the
# callback returns exactly one value, that value replaces the hash.
# Otherwise $cb_object, when set, is called with the hash, and again a
# single return value replaces it.  In every other case the hash itself is
# returned.
sub _json_object_hook {
    my $o  = $_[0];
    my @ks = keys %{$o};

    if ( $cb_sk_object and @ks == 1 and exists $cb_sk_object->{ $ks[0] } and ref $cb_sk_object->{ $ks[0] } ) {
        my @val = $cb_sk_object->{ $ks[0] }->( $o->{$ks[0]} );
        if (@val == 1) {
            return $val[0];
        }
    }

    # Declare first, assign conditionally: "my @val = ... if COND" has
    # undefined behaviour according to perlsyn.
    my @val;
    @val = $cb_object->($o) if ($cb_object);

    return @val == 1 ? $val[0] : $o;
}
|
1240 |
+
|
1241 |
+
|
1242 |
+
# Expose a snapshot of the parser's internal state as a hash reference
# (keys: text, at, ch, len, depth, encoding, is_valid_utf8).
sub PP_decode_box {
    return +{
        text          => $text,
        at            => $at,
        ch            => $ch,
        len           => $len,
        depth         => $depth,
        encoding      => $encoding,
        is_valid_utf8 => $is_valid_utf8,
    };
}
|
1253 |
+
|
1254 |
+
} # PARSE
|
1255 |
+
|
1256 |
+
|
1257 |
+
# Combine a UTF-16 surrogate pair into one character (from perlunicode).
#
#   $_[0] - high surrogate as hex digits (D800..DBFF)
#   $_[1] - low surrogate as hex digits (DC00..DFFF)
#
# Returns the character encoded as UTF-8 bytes.
sub _decode_surrogates {
    my ($high, $low) = (hex($_[0]), hex($_[1]));
    my $code_point   = 0x10000 + ($high - 0xD800) * 0x400 + ($low - 0xDC00);

    my $bytes = pack('U*', $code_point);
    utf8::encode( $bytes );
    return $bytes;
}
|
1263 |
+
|
1264 |
+
|
1265 |
+
# Turn the hex digits of a \uXXXX escape (first argument) into the
# corresponding character, returned as UTF-8 bytes.
sub _decode_unicode {
    my $hex_digits = shift;

    my $bytes = pack('U', hex $hex_digits);
    utf8::encode( $bytes );
    return $bytes;
}
|
1270 |
+
|
1271 |
+
#
|
1272 |
+
# Setup for various Perl versions (the code from JSON::PP58)
|
1273 |
+
#
|
1274 |
+
|
1275 |
+
# Compile-time setup that depends on the running perl version: fills in
# utf8::is_utf8 when missing, binds the JSON_PP_* encode/decode helpers,
# works around a broken join(), and installs the incremental-parser entry
# points on JSON::PP.
BEGIN {

    # Borrow Encode's implementation when this perl has no utf8::is_utf8.
    unless ( defined &utf8::is_utf8 ) {
       require Encode;
       *utf8::is_utf8 = *Encode::is_utf8;
    }

    # On 5.8 and later the pure-Perl helpers from this file are used.
    if ( $] >= 5.008 ) {
        *JSON::PP::JSON_PP_encode_ascii      = \&_encode_ascii;
        *JSON::PP::JSON_PP_encode_latin1     = \&_encode_latin1;
        *JSON::PP::JSON_PP_decode_surrogates = \&_decode_surrogates;
        *JSON::PP::JSON_PP_decode_unicode    = \&_decode_unicode;
    }

    if ($] >= 5.008 and $] < 5.008003) { # join() in 5.8.0 - 5.8.2 is broken.
        package # hide from PAUSE
          JSON::PP;
        require subs;
        subs->import('join');   # lets the sub below override the builtin
        eval q|
            sub join {
                return '' if (@_ < 2);
                my $j   = shift;
                my $str = shift;
                for (@_) { $str .= $j . $_; }
                return $str;
            }
        |;
    }


    # The three methods below create the per-object incremental parser on
    # first use and delegate to it.
    sub JSON::PP::incr_parse {
        local $Carp::CarpLevel = 1;
        ( $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new )->incr_parse( @_ );
    }


    sub JSON::PP::incr_skip {
        ( $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new )->incr_skip;
    }


    sub JSON::PP::incr_reset {
        ( $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new )->incr_reset;
    }

    # incr_text is an lvalue accessor for the buffered text; it is compiled
    # from a string and only on 5.6 or later.
    eval q{
        sub JSON::PP::incr_text : lvalue {
            $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new;

            if ( $_[0]->{_incr_parser}->{incr_parsing} ) {
                Carp::croak("incr_text can not be called when the incremental parser already started parsing");
            }
            $_[0]->{_incr_parser}->{incr_text};
        }
    } if ( $] >= 5.006 );

} # Setup for various Perl versions (the code from JSON::PP58)
|
1333 |
+
|
1334 |
+
|
1335 |
+
###############################
|
1336 |
+
# Utilities
|
1337 |
+
#
|
1338 |
+
|
1339 |
+
# Provide JSON::PP::blessed / reftype / refaddr: aliases to Scalar::Util when
# it can be loaded, pure-Perl substitutes otherwise.
BEGIN {
    eval 'require Scalar::Util';
    unless($@){
        *JSON::PP::blessed = \&Scalar::Util::blessed;
        *JSON::PP::reftype = \&Scalar::Util::reftype;
        *JSON::PP::refaddr = \&Scalar::Util::refaddr;
    }
    else{ # This code is from Scalar::Util.
        # warn $@;
        # blessed(): a method call only succeeds on a blessed reference, so a
        # UNIVERSAL method that returns ref($_[0]) yields the class name.
        eval 'sub UNIVERSAL::a_sub_not_likely_to_be_here { ref($_[0]) }';
        *JSON::PP::blessed = sub {
            local($@, $SIG{__DIE__}, $SIG{__WARN__});
            ref($_[0]) ? eval { $_[0]->a_sub_not_likely_to_be_here } : undef;
        };
        # reftype(): map the class of the B object to the storage type name.
        my %tmap = qw(
            B::NULL   SCALAR
            B::HV     HASH
            B::AV     ARRAY
            B::CV     CODE
            B::IO     IO
            B::GV     GLOB
            B::REGEXP REGEXP
        );
        *JSON::PP::reftype = sub {
            my $r = shift;

            return undef unless length(ref($r));

            my $t = ref(B::svref_2object($r));

            return
                exists $tmap{$t} ? $tmap{$t}
              : length(ref($$r)) ? 'REF'
              :                    'SCALAR';
        };
        # refaddr(): read the address out of the reference's stringification.
        # A blessed reference is temporarily reblessed into
        # 'Scalar::Util::Fake' before stringifying, then restored.
        *JSON::PP::refaddr = sub {
          return undef unless length(ref($_[0]));

          my $addr;
          if(defined(my $pkg = blessed($_[0]))) {
            $addr .= bless $_[0], 'Scalar::Util::Fake';
            bless $_[0], $pkg;
          }
          else {
            $addr .= $_[0]
          }

          $addr =~ /0x(\w+)/;
          local $^W;
          #no warnings 'portable';
          hex($1);
        }
    }
}
|
1393 |
+
|
1394 |
+
|
1395 |
+
# shamelessly copied and modified from JSON::XS code.
|
1396 |
+
|
1397 |
+
# Define the overloads for JSON::PP::Boolean, but only when JSON/PP.pm is not
# already recorded in %INC.  The package gets numeric conversion that yields
# the wrapped scalar, plus ++ and --, with fallback enabled for the other
# operators.
unless ( $INC{'JSON/PP.pm'} ) {
    eval q|
        package
            JSON::PP::Boolean;

        use overload (
            "0+"     => sub { ${$_[0]} },
            "++"     => sub { $_[0] = ${$_[0]} + 1 },
            "--"     => sub { $_[0] = ${$_[0]} - 1 },
            fallback => 1,
        );
    |;
}
|
1410 |
+
|
1411 |
+
# The two boolean singletons: references to the scalars 1 and 0, blessed
# into JSON::PP::Boolean.
$JSON::PP::true  = bless \( my $true_value  = 1 ), "JSON::PP::Boolean";
$JSON::PP::false = bless \( my $false_value = 0 ), "JSON::PP::Boolean";

# True when the argument is defined and is a JSON::PP::Boolean.
sub is_bool {
    return defined $_[0] && UNIVERSAL::isa($_[0], "JSON::PP::Boolean");
}

# Accessors for the singletons, and null as undef.
sub true {
    return $JSON::PP::true;
}

sub false {
    return $JSON::PP::false;
}

sub null {
    return undef;
}
|
1419 |
+
|
1420 |
+
###############################
|
1421 |
+
|
1422 |
+
###############################
|
1423 |
+
|
1424 |
+
package # hide from PAUSE
|
1425 |
+
JSON::PP::IncrParser;
|
1426 |
+
|
1427 |
+
use strict;
|
1428 |
+
|
1429 |
+
# State codes stored in $self->{incr_mode} by the incremental parser below.
use constant INCR_M_WS => 0; # initial whitespace skipping
|
1430 |
+
use constant INCR_M_STR => 1; # inside string
|
1431 |
+
use constant INCR_M_BS => 2; # inside backslash
|
1432 |
+
use constant INCR_M_JSON => 3; # outside anything, count nesting
|
1433 |
+
# NOTE(review): INCR_M_BS, INCR_M_C0 and INCR_M_C1 are not referenced by the
# parser subs visible in this package section -- confirm whether they are used.
use constant INCR_M_C0 => 4;
|
1434 |
+
use constant INCR_M_C1 => 5;
|
1435 |
+
|
1436 |
+
use vars qw($VERSION);
|
1437 |
+
$VERSION = '1.01';
|
1438 |
+
|
1439 |
+
# unpack template used to obtain a character's ordinal when skipping
# whitespace: 'C*' on perls older than 5.006, 'U*' otherwise.
my $unpack_format = $] < 5.006 ? 'C*' : 'U*';
|
1440 |
+
|
1441 |
+
# Constructor: returns a hash-based parser object blessed into $class, with
# no buffered text, zero nesting depth, a zero scan offset and the
# "currently parsing" flag cleared.
sub new {
    my $class = shift;

    my %state = (
        incr_nest    => 0,
        incr_text    => undef,
        incr_parsing => 0,
        incr_p       => 0,
    );

    return bless \%state, $class;
}
|
1451 |
+
|
1452 |
+
|
1453 |
+
# Append $text (if defined) to the buffered JSON fragment and, unless called
# in void context, try to extract decoded values from the buffer.
#
#   $self  - the IncrParser object
#   $coder - object providing get_max_size and, via _incr_parse,
#            get_max_depth and PP_decode_json
#   $text  - optional string to append to $self->{incr_text}
#
# Void context:   only buffers the text.
# List context:   returns the values collected by the do/until loop below.
# Scalar context: returns the value from one _incr_parse call, or undef when
#                 that value is false.
sub incr_parse {
|
1454 |
+
my ( $self, $coder, $text ) = @_;
|
1455 |
+
|
1456 |
+
$self->{incr_text} = '' unless ( defined $self->{incr_text} );
|
1457 |
+
|
1458 |
+
if ( defined $text ) {
|
1459 |
+
# Bring the buffer to the same internal (utf8-flagged) form as the incoming
# text before concatenating.
if ( utf8::is_utf8( $text ) and !utf8::is_utf8( $self->{incr_text} ) ) {
|
1460 |
+
utf8::upgrade( $self->{incr_text} ) ;
|
1461 |
+
utf8::decode( $self->{incr_text} ) ;
|
1462 |
+
}
|
1463 |
+
$self->{incr_text} .= $text;
|
1464 |
+
}
|
1465 |
+
|
1466 |
+
|
1467 |
+
# NOTE(review): $max_size is fetched here but not used anywhere else in this sub.
my $max_size = $coder->get_max_size;
|
1468 |
+
|
1469 |
+
if ( defined wantarray ) {
|
1470 |
+
|
1471 |
+
$self->{incr_mode} = INCR_M_WS unless defined $self->{incr_mode};
|
1472 |
+
|
1473 |
+
if ( wantarray ) {
|
1474 |
+
my @ret;
|
1475 |
+
|
1476 |
+
$self->{incr_parsing} = 1;
|
1477 |
+
|
1478 |
+
do {
|
1479 |
+
push @ret, $self->_incr_parse( $coder, $self->{incr_text} );
|
1480 |
+
|
1481 |
+
unless ( !$self->{incr_nest} and $self->{incr_mode} == INCR_M_JSON ) {
|
1482 |
+
$self->{incr_mode} = INCR_M_WS if $self->{incr_mode} != INCR_M_STR;
|
1483 |
+
}
|
1484 |
+
|
1485 |
+
# NOTE(review): this compares length(incr_text) with incr_p. _incr_parse leaves
# incr_p at 0 after a decode and at an offset within the text otherwise, so
# the condition appears to hold after every pass -- confirm whether the loop
# is meant to iterate more than once.
} until ( length $self->{incr_text} >= $self->{incr_p} );
|
1486 |
+
|
1487 |
+
$self->{incr_parsing} = 0;
|
1488 |
+
|
1489 |
+
return @ret;
|
1490 |
+
}
|
1491 |
+
else { # in scalar context
|
1492 |
+
$self->{incr_parsing} = 1;
|
1493 |
+
my $obj = $self->_incr_parse( $coder, $self->{incr_text} );
|
1494 |
+
$self->{incr_parsing} = 0 if defined $obj; # pointed by Martin J. Evans
|
1495 |
+
return $obj ? $obj : undef; # $obj is an empty string, parsing was completed.
|
1496 |
+
}
|
1497 |
+
|
1498 |
+
}
|
1499 |
+
|
1500 |
+
}
|
1501 |
+
|
1502 |
+
|
1503 |
+
# Scan $text from offset $self->{incr_p}, tracking string state and bracket
# nesting, to find the end of one JSON text; when one is found, decode the
# leading part of $self->{incr_text} with $coder->PP_decode_json and drop it
# from the buffer.
#
# Returns:
#   - nothing (bare return) while still inside a top-level string or inside
#     an unclosed array/object; the scan offset is saved in incr_p
#   - ''   when no characters were consumed, or when the decoded value is false
#   - the decoded value otherwise
# Croaks when nesting exceeds $coder->get_max_depth.
#
# NOTE(review): $skip, @obj and $tail are assigned/declared but not used in
# this sub.
sub _incr_parse {
|
1504 |
+
my ( $self, $coder, $text, $skip ) = @_;
|
1505 |
+
my $p = $self->{incr_p};
|
1506 |
+
my $restore = $p;
|
1507 |
+
|
1508 |
+
my @obj;
|
1509 |
+
my $len = length $text;
|
1510 |
+
|
1511 |
+
# Leading-whitespace phase: step over characters whose ordinal is <= 0x20,
# then switch to INCR_M_JSON at the first other character.
if ( $self->{incr_mode} == INCR_M_WS ) {
|
1512 |
+
while ( $len > $p ) {
|
1513 |
+
my $s = substr( $text, $p, 1 );
|
1514 |
+
# NOTE(review): `$p++` yields the old value, so when $p is 0 the `and next`
# is not taken and control falls through to the mode switch below after
# consuming that one character -- confirm this is intended.
$p++ and next if ( 0x20 >= unpack($unpack_format, $s) );
|
1515 |
+
$self->{incr_mode} = INCR_M_JSON;
|
1516 |
+
last;
|
1517 |
+
}
|
1518 |
+
}
|
1519 |
+
|
1520 |
+
while ( $len > $p ) {
|
1521 |
+
my $s = substr( $text, $p++, 1 );
|
1522 |
+
|
1523 |
+
if ( $s eq '"' ) {
|
1524 |
+
# A quote directly preceded by a backslash is treated as escaped.
# NOTE(review): only one preceding character is inspected, so a quote after
# an escaped backslash (\\") is also skipped; and for a quote at offset 0 the
# index $p - 2 is -1, which reads the last character of $text -- confirm.
if (substr( $text, $p - 2, 1 ) eq '\\' ) {
|
1525 |
+
next;
|
1526 |
+
}
|
1527 |
+
|
1528 |
+
if ( $self->{incr_mode} != INCR_M_STR ) {
|
1529 |
+
$self->{incr_mode} = INCR_M_STR;
|
1530 |
+
}
|
1531 |
+
else {
|
1532 |
+
$self->{incr_mode} = INCR_M_JSON;
|
1533 |
+
# A closing quote at nesting depth 0 ends a top-level string value.
unless ( $self->{incr_nest} ) {
|
1534 |
+
last;
|
1535 |
+
}
|
1536 |
+
}
|
1537 |
+
}
|
1538 |
+
|
1539 |
+
if ( $self->{incr_mode} == INCR_M_JSON ) {
|
1540 |
+
|
1541 |
+
if ( $s eq '[' or $s eq '{' ) {
|
1542 |
+
if ( ++$self->{incr_nest} > $coder->get_max_depth ) {
|
1543 |
+
Carp::croak('json text or perl structure exceeds maximum nesting level (max_depth set too low?)');
|
1544 |
+
}
|
1545 |
+
}
|
1546 |
+
elsif ( $s eq ']' or $s eq '}' ) {
|
1547 |
+
last if ( --$self->{incr_nest} <= 0 );
|
1548 |
+
}
|
1549 |
+
elsif ( $s eq '#' ) {
|
1550 |
+
# Skip a '#' comment up to and including the next "\n".
while ( $len > $p ) {
|
1551 |
+
last if substr( $text, $p++, 1 ) eq "\n";
|
1552 |
+
}
|
1553 |
+
}
|
1554 |
+
|
1555 |
+
}
|
1556 |
+
|
1557 |
+
}
|
1558 |
+
|
1559 |
+
# Remember how far the scan got so a later call can resume from here.
$self->{incr_p} = $p;
|
1560 |
+
|
1561 |
+
return if ( $self->{incr_mode} == INCR_M_STR and not $self->{incr_nest} );
|
1562 |
+
return if ( $self->{incr_mode} == INCR_M_JSON and $self->{incr_nest} > 0 );
|
1563 |
+
|
1564 |
+
return '' unless ( length substr( $self->{incr_text}, 0, $p ) );
|
1565 |
+
|
1566 |
+
local $Carp::CarpLevel = 2;
|
1567 |
+
|
1568 |
+
# Put incr_p back to its value on entry before decoding, and record the end
# offset in incr_c, which incr_skip reads.
$self->{incr_p} = $restore;
|
1569 |
+
$self->{incr_c} = $p;
|
1570 |
+
|
1571 |
+
my ( $obj, $tail ) = $coder->PP_decode_json( substr( $self->{incr_text}, 0, $p ), 0x10000001 );
|
1572 |
+
|
1573 |
+
# Decoding returned: drop the consumed prefix and restart scanning at 0.
$self->{incr_text} = substr( $self->{incr_text}, $p );
|
1574 |
+
$self->{incr_p} = 0;
|
1575 |
+
|
1576 |
+
return $obj || '';
|
1577 |
+
}
|
1578 |
+
|
1579 |
+
|
1580 |
+
# Return the buffered JSON fragment. Croaks while the incr_parsing flag is
# set, i.e. after incr_parse has started on the buffer and before it has
# cleared the flag again.
sub incr_text {
    my ($self) = @_;

    Carp::croak("incr_text can not be called when the incremental parser already started parsing")
        if $self->{incr_parsing};

    return $self->{incr_text};
}
|
1586 |
+
|
1587 |
+
|
1588 |
+
# Discard the buffered text up to offset incr_c (recorded by _incr_parse just
# before it decodes) and restart scanning at the beginning of what remains.
sub incr_skip {
    my ($self) = @_;

    my $resume_at = $self->{incr_c};
    $self->{incr_text} = substr( $self->{incr_text}, $resume_at );
    $self->{incr_p} = 0;
}
|
1593 |
+
|
1594 |
+
|
1595 |
+
# Return the parser to a pristine state: no buffered text, scan offset 0,
# mode 0 (the value of INCR_M_WS), nesting depth 0, not parsing.
sub incr_reset {
    my ($self) = @_;

    $self->{incr_text} = undef;
    $self->{$_} = 0 for qw(incr_p incr_mode incr_nest);
    $self->{incr_parsing} = 0;
}
|
1603 |
+
|
1604 |
+
###############################
|
1605 |
+
|
1606 |
+
|
1607 |
+
1;
|
1608 |
+
__END__
|
1609 |
+
=pod
|
1610 |
+
|
1611 |
+
=head1 NAME
|
1612 |
+
|
1613 |
+
JSON::PP - JSON::XS compatible pure-Perl module.
|
1614 |
+
|
1615 |
+
=head1 SYNOPSIS
|
1616 |
+
|
1617 |
+
use JSON::PP;
|
1618 |
+
|
1619 |
+
# exported functions, they croak on error
|
1620 |
+
# and expect/generate UTF-8
|
1621 |
+
|
1622 |
+
$utf8_encoded_json_text = encode_json $perl_hash_or_arrayref;
|
1623 |
+
$perl_hash_or_arrayref = decode_json $utf8_encoded_json_text;
|
1624 |
+
|
1625 |
+
# OO-interface
|
1626 |
+
|
1627 |
+
$json = JSON::PP->new->ascii->pretty->allow_nonref;
|
1628 |
+
|
1629 |
+
$json_text = $json->encode( $perl_scalar );
|
1630 |
+
$perl_scalar = $json->decode( $json_text );
|
1631 |
+
|
1632 |
+
$pretty_printed = $json->pretty->encode( $perl_scalar ); # pretty-printing
|
1633 |
+
|
1634 |
+
# Note that JSON version 2.0 and above will automatically use
|
1635 |
+
# JSON::XS or JSON::PP, so you should be able to just:
|
1636 |
+
|
1637 |
+
use JSON;
|
1638 |
+
|
1639 |
+
|
1640 |
+
=head1 VERSION
|
1641 |
+
|
1642 |
+
2.27200
|
1643 |
+
|
1644 |
+
L<JSON::XS> 2.27 (~2.30) compatible.
|
1645 |
+
|
1646 |
+
=head1 DESCRIPTION
|
1647 |
+
|
1648 |
+
This module is a L<JSON::XS> compatible pure-Perl module.
|
1649 |
+
(Perl 5.8 or later is recommended)
|
1650 |
+
|
1651 |
+
JSON::XS is the fastest and most proper JSON module on CPAN.
|
1652 |
+
It is written by Marc Lehmann in C, so must be compiled and
|
1653 |
+
installed in the used environment.
|
1654 |
+
|
1655 |
+
JSON::PP is a pure-Perl module and has compatibility to JSON::XS.
|
1656 |
+
|
1657 |
+
|
1658 |
+
=head2 FEATURES
|
1659 |
+
|
1660 |
+
=over
|
1661 |
+
|
1662 |
+
=item * correct unicode handling
|
1663 |
+
|
1664 |
+
This module knows how to handle Unicode (depending on Perl version).
|
1665 |
+
|
1666 |
+
See to L<JSON::XS/A FEW NOTES ON UNICODE AND PERL> and
|
1667 |
+
L<UNICODE HANDLING ON PERLS>.
|
1668 |
+
|
1669 |
+
|
1670 |
+
=item * round-trip integrity
|
1671 |
+
|
1672 |
+
When you serialise a perl data structure using only data types
|
1673 |
+
supported by JSON and Perl, the deserialised data structure is
|
1674 |
+
identical on the Perl level. (e.g. the string "2.0" doesn't suddenly
|
1675 |
+
become "2" just because it looks like a number). There I<are> minor
|
1676 |
+
exceptions to this, read the MAPPING section below to learn about
|
1677 |
+
those.
|
1678 |
+
|
1679 |
+
|
1680 |
+
=item * strict checking of JSON correctness
|
1681 |
+
|
1682 |
+
There is no guessing, no generating of illegal JSON texts by default,
|
1683 |
+
and only JSON is accepted as input by default (the latter is a
|
1684 |
+
security feature). But when some options are set, loose checking
|
1685 |
+
features are available.
|
1686 |
+
|
1687 |
+
=back
|
1688 |
+
|
1689 |
+
=head1 FUNCTIONAL INTERFACE
|
1690 |
+
|
1691 |
+
Some documents are copied and modified from L<JSON::XS/FUNCTIONAL INTERFACE>.
|
1692 |
+
|
1693 |
+
=head2 encode_json
|
1694 |
+
|
1695 |
+
$json_text = encode_json $perl_scalar
|
1696 |
+
|
1697 |
+
Converts the given Perl data structure to a UTF-8 encoded, binary string.
|
1698 |
+
|
1699 |
+
This function call is functionally identical to:
|
1700 |
+
|
1701 |
+
$json_text = JSON::PP->new->utf8->encode($perl_scalar)
|
1702 |
+
|
1703 |
+
=head2 decode_json
|
1704 |
+
|
1705 |
+
$perl_scalar = decode_json $json_text
|
1706 |
+
|
1707 |
+
The opposite of C<encode_json>: expects an UTF-8 (binary) string and tries
|
1708 |
+
to parse that as an UTF-8 encoded JSON text, returning the resulting
|
1709 |
+
reference.
|
1710 |
+
|
1711 |
+
This function call is functionally identical to:
|
1712 |
+
|
1713 |
+
$perl_scalar = JSON::PP->new->utf8->decode($json_text)
|
1714 |
+
|
1715 |
+
=head2 JSON::PP::is_bool
|
1716 |
+
|
1717 |
+
$is_boolean = JSON::PP::is_bool($scalar)
|
1718 |
+
|
1719 |
+
Returns true if the passed scalar represents either JSON::PP::true or
|
1720 |
+
JSON::PP::false, two constants that act like C<1> and C<0> respectively
|
1721 |
+
and are also used to represent JSON C<true> and C<false> in Perl strings.
|
1722 |
+
|
1723 |
+
=head2 JSON::PP::true
|
1724 |
+
|
1725 |
+
Returns JSON true value which is blessed object.
|
1726 |
+
It C<isa> JSON::PP::Boolean object.
|
1727 |
+
|
1728 |
+
=head2 JSON::PP::false
|
1729 |
+
|
1730 |
+
Returns JSON false value which is blessed object.
|
1731 |
+
It C<isa> JSON::PP::Boolean object.
|
1732 |
+
|
1733 |
+
=head2 JSON::PP::null
|
1734 |
+
|
1735 |
+
Returns C<undef>.
|
1736 |
+
|
1737 |
+
See L<MAPPING>, below, for more information on how JSON values are mapped to
|
1738 |
+
Perl.
|
1739 |
+
|
1740 |
+
|
1741 |
+
=head1 HOW DO I DECODE A DATA FROM OUTER AND ENCODE TO OUTER
|
1742 |
+
|
1743 |
+
This section supposes that your perl version is 5.8 or later.
|
1744 |
+
|
1745 |
+
If you know a JSON text from an outer world - a network, a file content, and so on,
|
1746 |
+
is encoded in UTF-8, you should use C<decode_json> or C<JSON> module object
|
1747 |
+
with C<utf8> enable. And the decoded result will contain UNICODE characters.
|
1748 |
+
|
1749 |
+
# from network
|
1750 |
+
my $json = JSON::PP->new->utf8;
|
1751 |
+
my $json_text = CGI->new->param( 'json_data' );
|
1752 |
+
my $perl_scalar = $json->decode( $json_text );
|
1753 |
+
|
1754 |
+
# from file content
|
1755 |
+
local $/;
|
1756 |
+
open( my $fh, '<', 'json.data' );
|
1757 |
+
$json_text = <$fh>;
|
1758 |
+
$perl_scalar = decode_json( $json_text );
|
1759 |
+
|
1760 |
+
If an outer data is not encoded in UTF-8, firstly you should C<decode> it.
|
1761 |
+
|
1762 |
+
use Encode;
|
1763 |
+
local $/;
|
1764 |
+
open( my $fh, '<', 'json.data' );
|
1765 |
+
my $encoding = 'cp932';
|
1766 |
+
my $unicode_json_text = decode( $encoding, <$fh> ); # UNICODE
|
1767 |
+
|
1768 |
+
# or you can write the below code.
|
1769 |
+
#
|
1770 |
+
# open( my $fh, "<:encoding($encoding)", 'json.data' );
|
1771 |
+
# $unicode_json_text = <$fh>;
|
1772 |
+
|
1773 |
+
In this case, C<$unicode_json_text> is of course UNICODE string.
|
1774 |
+
So you B<cannot> use C<decode_json> nor C<JSON> module object with C<utf8> enable.
|
1775 |
+
Instead of them, you use C<JSON> module object with C<utf8> disable.
|
1776 |
+
|
1777 |
+
$perl_scalar = $json->utf8(0)->decode( $unicode_json_text );
|
1778 |
+
|
1779 |
+
Or C<encode 'utf8'> and C<decode_json>:
|
1780 |
+
|
1781 |
+
$perl_scalar = decode_json( encode( 'utf8', $unicode_json_text ) );
|
1782 |
+
# this way is not efficient.
|
1783 |
+
|
1784 |
+
And now, you want to convert your C<$perl_scalar> into JSON data and
|
1785 |
+
send it to an outer world - a network or a file content, and so on.
|
1786 |
+
|
1787 |
+
Your data usually contains UNICODE strings and you want the converted data to be encoded
|
1788 |
+
in UTF-8, you should use C<encode_json> or C<JSON> module object with C<utf8> enable.
|
1789 |
+
|
1790 |
+
print encode_json( $perl_scalar ); # to a network? file? or display?
|
1791 |
+
# or
|
1792 |
+
print $json->utf8->encode( $perl_scalar );
|
1793 |
+
|
1794 |
+
If C<$perl_scalar> does not contain UNICODE but C<$encoding>-encoded strings
|
1795 |
+
for some reason, then its characters are regarded as B<latin1> for perl
|
1796 |
+
(because it does not concern with your $encoding).
|
1797 |
+
You B<cannot> use C<encode_json> nor C<JSON> module object with C<utf8> enable.
|
1798 |
+
Instead of them, you use C<JSON> module object with C<utf8> disable.
|
1799 |
+
Note that the resulting text is a UNICODE string, but printing it is not a problem.
|
1800 |
+
|
1801 |
+
# $perl_scalar contains $encoding encoded string values
|
1802 |
+
$unicode_json_text = $json->utf8(0)->encode( $perl_scalar );
|
1803 |
+
# $unicode_json_text consists of characters less than 0x100
|
1804 |
+
print $unicode_json_text;
|
1805 |
+
|
1806 |
+
Or C<decode $encoding> all string values and C<encode_json>:
|
1807 |
+
|
1808 |
+
$perl_scalar->{ foo } = decode( $encoding, $perl_scalar->{ foo } );
|
1809 |
+
# ... do it to each string values, then encode_json
|
1810 |
+
$json_text = encode_json( $perl_scalar );
|
1811 |
+
|
1812 |
+
This method is a proper way but probably not efficient.
|
1813 |
+
|
1814 |
+
See to L<Encode>, L<perluniintro>.
|
1815 |
+
|
1816 |
+
|
1817 |
+
=head1 METHODS
|
1818 |
+
|
1819 |
+
Basically, check to L<JSON> or L<JSON::XS>.
|
1820 |
+
|
1821 |
+
=head2 new
|
1822 |
+
|
1823 |
+
$json = JSON::PP->new
|
1824 |
+
|
1825 |
+
Returns a new JSON::PP object that can be used to de/encode JSON
|
1826 |
+
strings.
|
1827 |
+
|
1828 |
+
All boolean flags described below are by default I<disabled>.
|
1829 |
+
|
1830 |
+
The mutators for flags all return the JSON object again and thus calls can
|
1831 |
+
be chained:
|
1832 |
+
|
1833 |
+
my $json = JSON::PP->new->utf8->space_after->encode({a => [1,2]})
|
1834 |
+
=> {"a": [1, 2]}
|
1835 |
+
|
1836 |
+
=head2 ascii
|
1837 |
+
|
1838 |
+
$json = $json->ascii([$enable])
|
1839 |
+
|
1840 |
+
$enabled = $json->get_ascii
|
1841 |
+
|
1842 |
+
If $enable is true (or missing), then the encode method will not generate characters outside
|
1843 |
+
the code range 0..127. Any Unicode characters outside that range will be escaped using either
|
1844 |
+
a single \uXXXX or a double \uHHHH\uLLLL escape sequence, as per RFC4627.
|
1845 |
+
(See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>).
|
1846 |
+
|
1847 |
+
In Perl 5.005, there is no character having high value (more than 255).
|
1848 |
+
See to L<UNICODE HANDLING ON PERLS>.
|
1849 |
+
|
1850 |
+
If $enable is false, then the encode method will not escape Unicode characters unless
|
1851 |
+
required by the JSON syntax or other flags. This results in a faster and more compact format.
|
1852 |
+
|
1853 |
+
JSON::PP->new->ascii(1)->encode([chr 0x10401])
|
1854 |
+
=> ["\ud801\udc01"]
|
1855 |
+
|
1856 |
+
=head2 latin1
|
1857 |
+
|
1858 |
+
$json = $json->latin1([$enable])
|
1859 |
+
|
1860 |
+
$enabled = $json->get_latin1
|
1861 |
+
|
1862 |
+
If $enable is true (or missing), then the encode method will encode the resulting JSON
|
1863 |
+
text as latin1 (or iso-8859-1), escaping any characters outside the code range 0..255.
|
1864 |
+
|
1865 |
+
If $enable is false, then the encode method will not escape Unicode characters
|
1866 |
+
unless required by the JSON syntax or other flags.
|
1867 |
+
|
1868 |
+
JSON::XS->new->latin1->encode (["\x{89}\x{abc}"])
|
1869 |
+
=> ["\x{89}\\u0abc"] # (perl syntax, U+abc escaped, U+89 not)
|
1870 |
+
|
1871 |
+
See to L<UNICODE HANDLING ON PERLS>.
|
1872 |
+
|
1873 |
+
=head2 utf8
|
1874 |
+
|
1875 |
+
$json = $json->utf8([$enable])
|
1876 |
+
|
1877 |
+
$enabled = $json->get_utf8
|
1878 |
+
|
1879 |
+
If $enable is true (or missing), then the encode method will encode the JSON result
|
1880 |
+
into UTF-8, as required by many protocols, while the decode method expects to be handed
|
1881 |
+
an UTF-8-encoded string. Please note that UTF-8-encoded strings do not contain any
|
1882 |
+
characters outside the range 0..255, they are thus useful for bytewise/binary I/O.
|
1883 |
+
|
1884 |
+
(In Perl 5.005, any character outside the range 0..255 does not exist.
|
1885 |
+
See to L<UNICODE HANDLING ON PERLS>.)
|
1886 |
+
|
1887 |
+
In future versions, enabling this option might enable autodetection of the UTF-16 and UTF-32
|
1888 |
+
encoding families, as described in RFC4627.
|
1889 |
+
|
1890 |
+
If $enable is false, then the encode method will return the JSON string as a (non-encoded)
|
1891 |
+
Unicode string, while decode expects thus a Unicode string. Any decoding or encoding
|
1892 |
+
(e.g. to UTF-8 or UTF-16) needs to be done yourself, e.g. using the Encode module.
|
1893 |
+
|
1894 |
+
Example, output UTF-16BE-encoded JSON:
|
1895 |
+
|
1896 |
+
use Encode;
|
1897 |
+
$jsontext = encode "UTF-16BE", JSON::PP->new->encode ($object);
|
1898 |
+
|
1899 |
+
Example, decode UTF-32LE-encoded JSON:
|
1900 |
+
|
1901 |
+
use Encode;
|
1902 |
+
$object = JSON::PP->new->decode (decode "UTF-32LE", $jsontext);
|
1903 |
+
|
1904 |
+
|
1905 |
+
=head2 pretty
|
1906 |
+
|
1907 |
+
$json = $json->pretty([$enable])
|
1908 |
+
|
1909 |
+
This enables (or disables) all of the C<indent>, C<space_before> and
|
1910 |
+
C<space_after> flags in one call to generate the most readable
|
1911 |
+
(or most compact) form possible.
|
1912 |
+
|
1913 |
+
Equivalent to:
|
1914 |
+
|
1915 |
+
$json->indent->space_before->space_after
|
1916 |
+
|
1917 |
+
=head2 indent
|
1918 |
+
|
1919 |
+
$json = $json->indent([$enable])
|
1920 |
+
|
1921 |
+
$enabled = $json->get_indent
|
1922 |
+
|
1923 |
+
The default indent space length is three.
|
1924 |
+
You can use C<indent_length> to change the length.
|
1925 |
+
|
1926 |
+
=head2 space_before
|
1927 |
+
|
1928 |
+
$json = $json->space_before([$enable])
|
1929 |
+
|
1930 |
+
$enabled = $json->get_space_before
|
1931 |
+
|
1932 |
+
If C<$enable> is true (or missing), then the C<encode> method will add an extra
|
1933 |
+
optional space before the C<:> separating keys from values in JSON objects.
|
1934 |
+
|
1935 |
+
If C<$enable> is false, then the C<encode> method will not add any extra
|
1936 |
+
space at those places.
|
1937 |
+
|
1938 |
+
This setting has no effect when decoding JSON texts.
|
1939 |
+
|
1940 |
+
Example, space_before enabled, space_after and indent disabled:
|
1941 |
+
|
1942 |
+
{"key" :"value"}
|
1943 |
+
|
1944 |
+
=head2 space_after
|
1945 |
+
|
1946 |
+
$json = $json->space_after([$enable])
|
1947 |
+
|
1948 |
+
$enabled = $json->get_space_after
|
1949 |
+
|
1950 |
+
If C<$enable> is true (or missing), then the C<encode> method will add an extra
|
1951 |
+
optional space after the C<:> separating keys from values in JSON objects
|
1952 |
+
and extra whitespace after the C<,> separating key-value pairs and array
|
1953 |
+
members.
|
1954 |
+
|
1955 |
+
If C<$enable> is false, then the C<encode> method will not add any extra
|
1956 |
+
space at those places.
|
1957 |
+
|
1958 |
+
This setting has no effect when decoding JSON texts.
|
1959 |
+
|
1960 |
+
Example, space_before and indent disabled, space_after enabled:
|
1961 |
+
|
1962 |
+
{"key": "value"}
|
1963 |
+
|
1964 |
+
=head2 relaxed
|
1965 |
+
|
1966 |
+
$json = $json->relaxed([$enable])
|
1967 |
+
|
1968 |
+
$enabled = $json->get_relaxed
|
1969 |
+
|
1970 |
+
If C<$enable> is true (or missing), then C<decode> will accept some
|
1971 |
+
extensions to normal JSON syntax (see below). C<encode> will not be
|
1972 |
+
affected in any way. I<Be aware that this option makes you accept invalid
|
1973 |
+
JSON texts as if they were valid!>. I suggest only to use this option to
|
1974 |
+
parse application-specific files written by humans (configuration files,
|
1975 |
+
resource files etc.)
|
1976 |
+
|
1977 |
+
If C<$enable> is false (the default), then C<decode> will only accept
|
1978 |
+
valid JSON texts.
|
1979 |
+
|
1980 |
+
Currently accepted extensions are:
|
1981 |
+
|
1982 |
+
=over 4
|
1983 |
+
|
1984 |
+
=item * list items can have an end-comma
|
1985 |
+
|
1986 |
+
JSON I<separates> array elements and key-value pairs with commas. This
|
1987 |
+
can be annoying if you write JSON texts manually and want to be able to
|
1988 |
+
quickly append elements, so this extension accepts comma at the end of
|
1989 |
+
such items not just between them:
|
1990 |
+
|
1991 |
+
[
|
1992 |
+
1,
|
1993 |
+
2, <- this comma not normally allowed
|
1994 |
+
]
|
1995 |
+
{
|
1996 |
+
"k1": "v1",
|
1997 |
+
"k2": "v2", <- this comma not normally allowed
|
1998 |
+
}
|
1999 |
+
|
2000 |
+
=item * shell-style '#'-comments
|
2001 |
+
|
2002 |
+
Whenever JSON allows whitespace, shell-style comments are additionally
|
2003 |
+
allowed. They are terminated by the first carriage-return or line-feed
|
2004 |
+
character, after which more white-space and comments are allowed.
|
2005 |
+
|
2006 |
+
[
|
2007 |
+
1, # this comment not allowed in JSON
|
2008 |
+
# neither this one...
|
2009 |
+
]
|
2010 |
+
|
2011 |
+
=back
|
2012 |
+
|
2013 |
+
=head2 canonical
|
2014 |
+
|
2015 |
+
$json = $json->canonical([$enable])
|
2016 |
+
|
2017 |
+
$enabled = $json->get_canonical
|
2018 |
+
|
2019 |
+
If C<$enable> is true (or missing), then the C<encode> method will output JSON objects
|
2020 |
+
by sorting their keys. This is adding a comparatively high overhead.
|
2021 |
+
|
2022 |
+
If C<$enable> is false, then the C<encode> method will output key-value
|
2023 |
+
pairs in the order Perl stores them (which will likely change between runs
|
2024 |
+
of the same script).
|
2025 |
+
|
2026 |
+
This option is useful if you want the same data structure to be encoded as
|
2027 |
+
the same JSON text (given the same overall settings). If it is disabled,
|
2028 |
+
the same hash might be encoded differently even if it contains the same data,
|
2029 |
+
as key-value pairs have no inherent ordering in Perl.
|
2030 |
+
|
2031 |
+
This setting has no effect when decoding JSON texts.
|
2032 |
+
|
2033 |
+
If you want your own sorting routine, you can give a code reference
|
2034 |
+
or a subroutine name to C<sort_by>. See to C<JSON::PP OWN METHODS>.
|
2035 |
+
|
2036 |
+
=head2 allow_nonref
|
2037 |
+
|
2038 |
+
$json = $json->allow_nonref([$enable])
|
2039 |
+
|
2040 |
+
$enabled = $json->get_allow_nonref
|
2041 |
+
|
2042 |
+
If C<$enable> is true (or missing), then the C<encode> method can convert a
|
2043 |
+
non-reference into its corresponding string, number or null JSON value,
|
2044 |
+
which is an extension to RFC4627. Likewise, C<decode> will accept those JSON
|
2045 |
+
values instead of croaking.
|
2046 |
+
|
2047 |
+
If C<$enable> is false, then the C<encode> method will croak if it isn't
|
2048 |
+
passed an arrayref or hashref, as JSON texts must either be an object
|
2049 |
+
or array. Likewise, C<decode> will croak if given something that is not a
|
2050 |
+
JSON object or array.
|
2051 |
+
|
2052 |
+
JSON::PP->new->allow_nonref->encode ("Hello, World!")
|
2053 |
+
=> "Hello, World!"
|
2054 |
+
|
2055 |
+
=head2 allow_unknown
|
2056 |
+
|
2057 |
+
$json = $json->allow_unknown ([$enable])
|
2058 |
+
|
2059 |
+
$enabled = $json->get_allow_unknown
|
2060 |
+
|
2061 |
+
If $enable is true (or missing), then "encode" will *not* throw an
|
2062 |
+
exception when it encounters values it cannot represent in JSON (for
|
2063 |
+
example, filehandles) but instead will encode a JSON "null" value.
|
2064 |
+
Note that blessed objects are not included here and are handled
|
2065 |
+
separately by C<allow_blessed>.
|
2066 |
+
|
2067 |
+
If $enable is false (the default), then "encode" will throw an
|
2068 |
+
exception when it encounters anything it cannot encode as JSON.
|
2069 |
+
|
2070 |
+
This option does not affect "decode" in any way, and it is
|
2071 |
+
recommended to leave it off unless you know your communications
|
2072 |
+
partner.
|
2073 |
+
|
2074 |
+
=head2 allow_blessed
|
2075 |
+
|
2076 |
+
$json = $json->allow_blessed([$enable])
|
2077 |
+
|
2078 |
+
$enabled = $json->get_allow_blessed
|
2079 |
+
|
2080 |
+
If C<$enable> is true (or missing), then the C<encode> method will not
|
2081 |
+
barf when it encounters a blessed reference. Instead, the value of the
|
2082 |
+
B<convert_blessed> option will decide whether C<null> (C<convert_blessed>
|
2083 |
+
disabled or no C<TO_JSON> method found) or a representation of the
|
2084 |
+
object (C<convert_blessed> enabled and C<TO_JSON> method found) is being
|
2085 |
+
encoded. Has no effect on C<decode>.
|
2086 |
+
|
2087 |
+
If C<$enable> is false (the default), then C<encode> will throw an
|
2088 |
+
exception when it encounters a blessed object.
|
2089 |
+
|
2090 |
+
=head2 convert_blessed
|
2091 |
+
|
2092 |
+
$json = $json->convert_blessed([$enable])
|
2093 |
+
|
2094 |
+
$enabled = $json->get_convert_blessed
|
2095 |
+
|
2096 |
+
If C<$enable> is true (or missing), then C<encode>, upon encountering a
|
2097 |
+
blessed object, will check for the availability of the C<TO_JSON> method
|
2098 |
+
on the object's class. If found, it will be called in scalar context
|
2099 |
+
and the resulting scalar will be encoded instead of the object. If no
|
2100 |
+
C<TO_JSON> method is found, the value of C<allow_blessed> will decide what
|
2101 |
+
to do.
|
2102 |
+
|
2103 |
+
The C<TO_JSON> method may safely call die if it wants. If C<TO_JSON>
|
2104 |
+
returns other blessed objects, those will be handled in the same
|
2105 |
+
way. C<TO_JSON> must take care of not causing an endless recursion cycle
|
2106 |
+
(== crash) in this case. The name of C<TO_JSON> was chosen because other
|
2107 |
+
methods called by the Perl core (== not by the user of the object) are
|
2108 |
+
usually in upper case letters and to avoid collisions with the C<to_json>
|
2109 |
+
function or method.
|
2110 |
+
|
2111 |
+
This setting does not yet influence C<decode> in any way.
|
2112 |
+
|
2113 |
+
If C<$enable> is false, then the C<allow_blessed> setting will decide what
|
2114 |
+
to do when a blessed object is found.
|
2115 |
+
|
2116 |
+
=head2 filter_json_object
|
2117 |
+
|
2118 |
+
$json = $json->filter_json_object([$coderef])
|
2119 |
+
|
2120 |
+
When C<$coderef> is specified, it will be called from C<decode> each
|
2121 |
+
time it decodes a JSON object. The only argument passed to the coderef
|
2122 |
+
is a reference to the newly-created hash. If the code reference returns
|
2123 |
+
a single scalar (which need not be a reference), this value
|
2124 |
+
(i.e. a copy of that scalar to avoid aliasing) is inserted into the
|
2125 |
+
deserialised data structure. If it returns an empty list
|
2126 |
+
(NOTE: I<not> C<undef>, which is a valid scalar), the original deserialised
|
2127 |
+
hash will be inserted. This setting can slow down decoding considerably.
|
2128 |
+
|
2129 |
+
When C<$coderef> is omitted or undefined, any existing callback will
|
2130 |
+
be removed and C<decode> will not change the deserialised hash in any
|
2131 |
+
way.
|
2132 |
+
|
2133 |
+
Example, convert all JSON objects into the integer 5:
|
2134 |
+
|
2135 |
+
my $js = JSON::PP->new->filter_json_object (sub { 5 });
|
2136 |
+
# returns [5]
|
2137 |
+
$js->decode ('[{}]'); # the given subroutine takes a hash reference.
|
2138 |
+
# throw an exception because allow_nonref is not enabled
|
2139 |
+
# so a lone 5 is not allowed.
|
2140 |
+
$js->decode ('{"a":1, "b":2}');
|
2141 |
+
|
2142 |
+
=head2 filter_json_single_key_object
|
2143 |
+
|
2144 |
+
$json = $json->filter_json_single_key_object($key [=> $coderef])
|
2145 |
+
|
2146 |
+
Works remotely similar to C<filter_json_object>, but is only called for
|
2147 |
+
JSON objects having a single key named C<$key>.
|
2148 |
+
|
2149 |
+
This C<$coderef> is called before the one specified via
|
2150 |
+
C<filter_json_object>, if any. It gets passed the single value in the JSON
|
2151 |
+
object. If it returns a single value, it will be inserted into the data
|
2152 |
+
structure. If it returns nothing (not even C<undef> but the empty list),
|
2153 |
+
the callback from C<filter_json_object> will be called next, as if no
|
2154 |
+
single-key callback were specified.
|
2155 |
+
|
2156 |
+
If C<$coderef> is omitted or undefined, the corresponding callback will be
|
2157 |
+
disabled. There can only ever be one callback for a given key.
|
2158 |
+
|
2159 |
+
As this callback gets called less often than the C<filter_json_object>
|
2160 |
+
one, decoding speed will not usually suffer as much. Therefore, single-key
|
2161 |
+
objects make excellent targets to serialise Perl objects into, especially
|
2162 |
+
as single-key JSON objects are as close to the type-tagged value concept
|
2163 |
+
as JSON gets (it's basically an ID/VALUE tuple). Of course, JSON does not
|
2164 |
+
support this in any way, so you need to make sure your data never looks
|
2165 |
+
like a serialised Perl hash.
|
2166 |
+
|
2167 |
+
Typical names for the single object key are C<__class_whatever__>, or
|
2168 |
+
C<$__dollars_are_rarely_used__$> or C<}ugly_brace_placement>, or even
|
2169 |
+
things like C<__class_md5sum(classname)__>, to reduce the risk of clashing
|
2170 |
+
with real hashes.
|
2171 |
+
|
2172 |
+
Example, decode JSON objects of the form C<< { "__widget__" => <id> } >>
|
2173 |
+
into the corresponding C<< $WIDGET{<id>} >> object:
|
2174 |
+
|
2175 |
+
# return whatever is in $WIDGET{5}:
|
2176 |
+
JSON::PP
|
2177 |
+
->new
|
2178 |
+
->filter_json_single_key_object (__widget__ => sub {
|
2179 |
+
$WIDGET{ $_[0] }
|
2180 |
+
})
|
2181 |
+
->decode ('{"__widget__": 5}')
|
2182 |
+
|
2183 |
+
# this can be used with a TO_JSON method in some "widget" class
|
2184 |
+
# for serialisation to json:
|
2185 |
+
sub WidgetBase::TO_JSON {
|
2186 |
+
my ($self) = @_;
|
2187 |
+
|
2188 |
+
unless ($self->{id}) {
|
2189 |
+
$self->{id} = ..get..some..id..;
|
2190 |
+
$WIDGET{$self->{id}} = $self;
|
2191 |
+
}
|
2192 |
+
|
2193 |
+
{ __widget__ => $self->{id} }
|
2194 |
+
}
|
2195 |
+
|
2196 |
+
=head2 shrink
|
2197 |
+
|
2198 |
+
$json = $json->shrink([$enable])
|
2199 |
+
|
2200 |
+
$enabled = $json->get_shrink
|
2201 |
+
|
2202 |
+
In JSON::XS, this flag resizes strings generated by either
|
2203 |
+
C<encode> or C<decode> to their minimum size possible.
|
2204 |
+
It will also try to downgrade any strings to octet-form if possible.
|
2205 |
+
|
2206 |
+
In JSON::PP, it is noop about resizing strings but tries
|
2207 |
+
C<utf8::downgrade> to the returned string by C<encode>.
|
2208 |
+
See to L<utf8>.
|
2209 |
+
|
2210 |
+
See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>
|
2211 |
+
|
2212 |
+
=head2 max_depth
|
2213 |
+
|
2214 |
+
$json = $json->max_depth([$maximum_nesting_depth])
|
2215 |
+
|
2216 |
+
$max_depth = $json->get_max_depth
|
2217 |
+
|
2218 |
+
Sets the maximum nesting level (default C<512>) accepted while encoding
|
2219 |
+
or decoding. If a higher nesting level is detected in JSON text or a Perl
|
2220 |
+
data structure, then the encoder and decoder will stop and croak at that
|
2221 |
+
point.
|
2222 |
+
|
2223 |
+
Nesting level is defined by number of hash- or arrayrefs that the encoder
|
2224 |
+
needs to traverse to reach a given point or the number of C<{> or C<[>
|
2225 |
+
characters without their matching closing parenthesis crossed to reach a
|
2226 |
+
given character in a string.
|
2227 |
+
|
2228 |
+
If no argument is given, the highest possible setting will be used, which
|
2229 |
+
is rarely useful.
|
2230 |
+
|
2231 |
+
See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
|
2232 |
+
|
2233 |
+
When a large value (100 or more) was set and it de/encodes a deep nested object/text,
|
2234 |
+
it may raise a warning 'Deep recursion on subroutine' at the perl runtime phase.
|
2235 |
+
|
2236 |
+
=head2 max_size
|
2237 |
+
|
2238 |
+
$json = $json->max_size([$maximum_string_size])
|
2239 |
+
|
2240 |
+
$max_size = $json->get_max_size
|
2241 |
+
|
2242 |
+
Set the maximum length a JSON text may have (in bytes) where decoding is
|
2243 |
+
being attempted. The default is C<0>, meaning no limit. When C<decode>
|
2244 |
+
is called on a string that is longer than this many bytes, it will not
|
2245 |
+
attempt to decode the string but throw an exception. This setting has no
|
2246 |
+
effect on C<encode> (yet).
|
2247 |
+
|
2248 |
+
If no argument is given, the limit check will be deactivated (same as when
|
2249 |
+
C<0> is specified).
|
2250 |
+
|
2251 |
+
See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
|
2252 |
+
|
2253 |
+
=head2 encode
|
2254 |
+
|
2255 |
+
$json_text = $json->encode($perl_scalar)
|
2256 |
+
|
2257 |
+
Converts the given Perl data structure (a simple scalar or a reference
|
2258 |
+
to a hash or array) to its JSON representation. Simple scalars will be
|
2259 |
+
converted into JSON string or number sequences, while references to arrays
|
2260 |
+
become JSON arrays and references to hashes become JSON objects. Undefined
|
2261 |
+
Perl values (e.g. C<undef>) become JSON C<null> values.
|
2262 |
+
References to the integers C<0> and C<1> are converted into C<false> and C<true>, respectively.
|
2263 |
+
|
2264 |
+
=head2 decode
|
2265 |
+
|
2266 |
+
$perl_scalar = $json->decode($json_text)
|
2267 |
+
|
2268 |
+
The opposite of C<encode>: expects a JSON text and tries to parse it,
|
2269 |
+
returning the resulting simple scalar or reference. Croaks on error.
|
2270 |
+
|
2271 |
+
JSON numbers and strings become simple Perl scalars. JSON arrays become
|
2272 |
+
Perl arrayrefs and JSON objects become Perl hashrefs. C<true> becomes
|
2273 |
+
C<1> (C<JSON::true>), C<false> becomes C<0> (C<JSON::false>) and
|
2274 |
+
C<null> becomes C<undef>.
|
2275 |
+
|
2276 |
+
=head2 decode_prefix
|
2277 |
+
|
2278 |
+
($perl_scalar, $characters) = $json->decode_prefix($json_text)
|
2279 |
+
|
2280 |
+
This works like the C<decode> method, but instead of raising an exception
|
2281 |
+
when there is trailing garbage after the first JSON object, it will
|
2282 |
+
silently stop parsing there and return the number of characters consumed
|
2283 |
+
so far.
|
2284 |
+
|
2285 |
+
JSON->new->decode_prefix ("[1] the tail")
|
2286 |
+
=> ([1], 3)
|
2287 |
+
|
2288 |
+
=head1 INCREMENTAL PARSING
|
2289 |
+
|
2290 |
+
Most of this section is copied and modified from L<JSON::XS/INCREMENTAL PARSING>.
|
2291 |
+
|
2292 |
+
In some cases, there is the need for incremental parsing of JSON texts.
|
2293 |
+
This module does allow you to parse a JSON stream incrementally.
|
2294 |
+
It does so by accumulating text until it has a full JSON object, which
|
2295 |
+
it then can decode. This process is similar to using C<decode_prefix>
|
2296 |
+
to see if a full JSON object is available, but is much more efficient
|
2297 |
+
(and can be implemented with a minimum of method calls).
|
2298 |
+
|
2299 |
+
This module will only attempt to parse the JSON text once it is sure it
|
2300 |
+
has enough text to get a decisive result, using a very simple but
|
2301 |
+
truly incremental parser. This means that it sometimes won't stop as
|
2302 |
+
early as the full parser, for example, it doesn't detect parenthesis
|
2303 |
+
mismatches. The only thing it guarantees is that it starts decoding as
|
2304 |
+
soon as a syntactically valid JSON text has been seen. This means you need
|
2305 |
+
to set resource limits (e.g. C<max_size>) to ensure the parser will stop
|
2306 |
+
parsing in the presence of syntax errors.
|
2307 |
+
|
2308 |
+
The following methods implement this incremental parser.
|
2309 |
+
|
2310 |
+
=head2 incr_parse
|
2311 |
+
|
2312 |
+
$json->incr_parse( [$string] ) # void context
|
2313 |
+
|
2314 |
+
$obj_or_undef = $json->incr_parse( [$string] ) # scalar context
|
2315 |
+
|
2316 |
+
@obj_or_empty = $json->incr_parse( [$string] ) # list context
|
2317 |
+
|
2318 |
+
This is the central parsing function. It can both append new text and
|
2319 |
+
extract objects from the stream accumulated so far (both of these
|
2320 |
+
functions are optional).
|
2321 |
+
|
2322 |
+
If C<$string> is given, then this string is appended to the already
|
2323 |
+
existing JSON fragment stored in the C<$json> object.
|
2324 |
+
|
2325 |
+
After that, if the function is called in void context, it will simply
|
2326 |
+
return without doing anything further. This can be used to add more text
|
2327 |
+
in as many chunks as you want.
|
2328 |
+
|
2329 |
+
If the method is called in scalar context, then it will try to extract
|
2330 |
+
exactly I<one> JSON object. If that is successful, it will return this
|
2331 |
+
object, otherwise it will return C<undef>. If there is a parse error,
|
2332 |
+
this method will croak just as C<decode> would do (one can then use
|
2333 |
+
C<incr_skip> to skip the erroneous part). This is the most common way of
|
2334 |
+
using the method.
|
2335 |
+
|
2336 |
+
And finally, in list context, it will try to extract as many objects
|
2337 |
+
from the stream as it can find and return them, or the empty list
|
2338 |
+
otherwise. For this to work, there must be no separators between the JSON
|
2339 |
+
objects or arrays, instead they must be concatenated back-to-back. If
|
2340 |
+
an error occurs, an exception will be raised as in the scalar context
|
2341 |
+
case. Note that in this case, any previously-parsed JSON texts will be
|
2342 |
+
lost.
|
2343 |
+
|
2344 |
+
Example: Parse some JSON arrays/objects in a given string and return them.
|
2345 |
+
|
2346 |
+
my @objs = JSON->new->incr_parse ("[5][7][1,2]");
|
2347 |
+
|
2348 |
+
=head2 incr_text
|
2349 |
+
|
2350 |
+
$lvalue_string = $json->incr_text
|
2351 |
+
|
2352 |
+
This method returns the currently stored JSON fragment as an lvalue, that
|
2353 |
+
is, you can manipulate it. This I<only> works when a preceding call to
|
2354 |
+
C<incr_parse> in I<scalar context> successfully returned an object. Under
|
2355 |
+
all other circumstances you must not call this function (I mean it.
|
2356 |
+
although in simple tests it might actually work, it I<will> fail under
|
2357 |
+
real world conditions). As a special exception, you can also call this
|
2358 |
+
method before having parsed anything.
|
2359 |
+
|
2360 |
+
This function is useful in two cases: a) finding the trailing text after a
|
2361 |
+
JSON object or b) parsing multiple JSON objects separated by non-JSON text
|
2362 |
+
(such as commas).
|
2363 |
+
|
2364 |
+
$json->incr_text =~ s/\s*,\s*//;
|
2365 |
+
|
2366 |
+
In Perl 5.005, C<lvalue> attribute is not available.
|
2367 |
+
You must write code like the following:
|
2368 |
+
|
2369 |
+
$string = $json->incr_text;
|
2370 |
+
$string =~ s/\s*,\s*//;
|
2371 |
+
$json->incr_text( $string );
|
2372 |
+
|
2373 |
+
=head2 incr_skip
|
2374 |
+
|
2375 |
+
$json->incr_skip
|
2376 |
+
|
2377 |
+
This will reset the state of the incremental parser and will remove the
|
2378 |
+
parsed text from the input buffer. This is useful after C<incr_parse>
|
2379 |
+
died, in which case the input buffer and incremental parser state is left
|
2380 |
+
unchanged, to skip the text parsed so far and to reset the parse state.
|
2381 |
+
|
2382 |
+
=head2 incr_reset
|
2383 |
+
|
2384 |
+
$json->incr_reset
|
2385 |
+
|
2386 |
+
This completely resets the incremental parser, that is, after this call,
|
2387 |
+
it will be as if the parser had never parsed anything.
|
2388 |
+
|
2389 |
+
This is useful if you want to repeatedly parse JSON objects and want to
|
2390 |
+
ignore any trailing data, which means you have to reset the parser after
|
2391 |
+
each successful decode.
|
2392 |
+
|
2393 |
+
See L<JSON::XS/INCREMENTAL PARSING> for examples.
|
2394 |
+
|
2395 |
+
|
2396 |
+
=head1 JSON::PP OWN METHODS
|
2397 |
+
|
2398 |
+
=head2 allow_singlequote
|
2399 |
+
|
2400 |
+
$json = $json->allow_singlequote([$enable])
|
2401 |
+
|
2402 |
+
If C<$enable> is true (or missing), then C<decode> will accept
|
2403 |
+
JSON strings quoted with single quotes, which are invalid in the strict JSON
|
2404 |
+
format.
|
2405 |
+
|
2406 |
+
$json->allow_singlequote->decode(q|{"foo":'bar'}|);
|
2407 |
+
$json->allow_singlequote->decode(q|{'foo':"bar"}|);
|
2408 |
+
$json->allow_singlequote->decode(q|{'foo':'bar'}|);
|
2409 |
+
|
2410 |
+
As with the C<relaxed> option, this option may be used to parse
|
2411 |
+
application-specific files written by humans.
|
2412 |
+
|
2413 |
+
|
2414 |
+
=head2 allow_barekey
|
2415 |
+
|
2416 |
+
$json = $json->allow_barekey([$enable])
|
2417 |
+
|
2418 |
+
If C<$enable> is true (or missing), then C<decode> will accept
|
2419 |
+
bare (unquoted) keys in JSON objects, which are invalid in strict JSON.
|
2420 |
+
|
2421 |
+
As with the C<relaxed> option, this option may be used to parse
|
2422 |
+
application-specific files written by humans.
|
2423 |
+
|
2424 |
+
$json->allow_barekey->decode('{foo:"bar"}');
|
2425 |
+
|
2426 |
+
=head2 allow_bignum
|
2427 |
+
|
2428 |
+
$json = $json->allow_bignum([$enable])
|
2429 |
+
|
2430 |
+
If C<$enable> is true (or missing), then C<decode> will convert
|
2431 |
+
a big integer that Perl cannot handle as an integer into a L<Math::BigInt>
|
2432 |
+
object and convert any floating-point number into a L<Math::BigFloat>.
|
2433 |
+
|
2434 |
+
Conversely, C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
|
2435 |
+
objects into JSON numbers with C<allow_blessed> enabled.
|
2436 |
+
|
2437 |
+
$json->allow_nonref->allow_blessed->allow_bignum;
|
2438 |
+
$bigfloat = $json->decode('2.000000000000000000000000001');
|
2439 |
+
print $json->encode($bigfloat);
|
2440 |
+
# => 2.000000000000000000000000001
|
2441 |
+
|
2442 |
+
See L<JSON::XS/MAPPING> for the normal conversion of JSON numbers.
|
2443 |
+
|
2444 |
+
=head2 loose
|
2445 |
+
|
2446 |
+
$json = $json->loose([$enable])
|
2447 |
+
|
2448 |
+
The unescaped [\x00-\x1f\x22\x2f\x5c] strings are invalid in JSON strings
|
2449 |
+
and the module doesn't allow C<decode> to accept these (except for \x2f).
|
2450 |
+
If C<$enable> is true (or missing), then C<decode> will accept these
|
2451 |
+
unescaped strings.
|
2452 |
+
|
2453 |
+
$json->loose->decode(qq|["abc
|
2454 |
+
def"]|);
|
2455 |
+
|
2456 |
+
See L<JSON::XS/SECURITY CONSIDERATIONS>.
|
2457 |
+
|
2458 |
+
=head2 escape_slash
|
2459 |
+
|
2460 |
+
$json = $json->escape_slash([$enable])
|
2461 |
+
|
2462 |
+
According to JSON Grammar, I<slash> (U+002F) is escaped. But default
|
2463 |
+
JSON::PP (the same as JSON::XS) encodes strings without escaping slash.
|
2464 |
+
|
2465 |
+
If C<$enable> is true (or missing), then C<encode> will escape slashes.
|
2466 |
+
|
2467 |
+
=head2 indent_length
|
2468 |
+
|
2469 |
+
$json = $json->indent_length($length)
|
2470 |
+
|
2471 |
+
JSON::XS indent space length is 3 and cannot be changed.
|
2472 |
+
JSON::PP sets the indent space length with the given $length.
|
2473 |
+
The default is 3. The acceptable range is 0 to 15.
|
2474 |
+
|
2475 |
+
=head2 sort_by
|
2476 |
+
|
2477 |
+
$json = $json->sort_by($function_name)
|
2478 |
+
$json = $json->sort_by($subroutine_ref)
|
2479 |
+
|
2480 |
+
If $function_name or $subroutine_ref is set, that sort routine is used
|
2481 |
+
in encoding JSON objects.
|
2482 |
+
|
2483 |
+
$js = $pc->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b })->encode($obj);
|
2484 |
+
# is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
|
2485 |
+
|
2486 |
+
$js = $pc->sort_by('own_sort')->encode($obj);
|
2487 |
+
# is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
|
2488 |
+
|
2489 |
+
sub JSON::PP::own_sort { $JSON::PP::a cmp $JSON::PP::b }
|
2490 |
+
|
2491 |
+
As the sorting routine runs in the JSON::PP scope, the given
|
2492 |
+
subroutine name and the special variables C<$a>, C<$b> will begin with
|
2493 |
+
'JSON::PP::'.
|
2494 |
+
|
2495 |
+
If $integer is set, then the effect is the same as C<canonical> on.
|
2496 |
+
|
2497 |
+
=head1 INTERNAL
|
2498 |
+
|
2499 |
+
For developers.
|
2500 |
+
|
2501 |
+
=over
|
2502 |
+
|
2503 |
+
=item PP_encode_box
|
2504 |
+
|
2505 |
+
Returns
|
2506 |
+
|
2507 |
+
{
|
2508 |
+
depth => $depth,
|
2509 |
+
indent_count => $indent_count,
|
2510 |
+
}
|
2511 |
+
|
2512 |
+
|
2513 |
+
=item PP_decode_box
|
2514 |
+
|
2515 |
+
Returns
|
2516 |
+
|
2517 |
+
{
|
2518 |
+
text => $text,
|
2519 |
+
at => $at,
|
2520 |
+
ch => $ch,
|
2521 |
+
len => $len,
|
2522 |
+
depth => $depth,
|
2523 |
+
encoding => $encoding,
|
2524 |
+
is_valid_utf8 => $is_valid_utf8,
|
2525 |
+
};
|
2526 |
+
|
2527 |
+
=back
|
2528 |
+
|
2529 |
+
=head1 MAPPING
|
2530 |
+
|
2531 |
+
This section is copied from JSON::XS and modified for C<JSON::PP>.
|
2532 |
+
JSON::XS and JSON::PP mapping mechanisms are almost equivalent.
|
2533 |
+
|
2534 |
+
See L<JSON::XS/MAPPING>.
|
2535 |
+
|
2536 |
+
=head2 JSON -> PERL
|
2537 |
+
|
2538 |
+
=over 4
|
2539 |
+
|
2540 |
+
=item object
|
2541 |
+
|
2542 |
+
A JSON object becomes a reference to a hash in Perl. No ordering of object
|
2543 |
+
keys is preserved (JSON does not preserve object key ordering itself).
|
2544 |
+
|
2545 |
+
=item array
|
2546 |
+
|
2547 |
+
A JSON array becomes a reference to an array in Perl.
|
2548 |
+
|
2549 |
+
=item string
|
2550 |
+
|
2551 |
+
A JSON string becomes a string scalar in Perl - Unicode codepoints in JSON
|
2552 |
+
are represented by the same codepoints in the Perl string, so no manual
|
2553 |
+
decoding is necessary.
|
2554 |
+
|
2555 |
+
=item number
|
2556 |
+
|
2557 |
+
A JSON number becomes either an integer, numeric (floating point) or
|
2558 |
+
string scalar in perl, depending on its range and any fractional parts. On
|
2559 |
+
the Perl level, there is no difference between those as Perl handles all
|
2560 |
+
the conversion details, but an integer may take slightly less memory and
|
2561 |
+
might represent more values exactly than floating point numbers.
|
2562 |
+
|
2563 |
+
If the number consists of digits only, C<JSON> will try to represent
|
2564 |
+
it as an integer value. If that fails, it will try to represent it as
|
2565 |
+
a numeric (floating point) value if that is possible without loss of
|
2566 |
+
precision. Otherwise it will preserve the number as a string value (in
|
2567 |
+
which case you lose roundtripping ability, as the JSON number will be
|
2568 |
+
re-encoded to a JSON string).
|
2569 |
+
|
2570 |
+
Numbers containing a fractional or exponential part will always be
|
2571 |
+
represented as numeric (floating point) values, possibly at a loss of
|
2572 |
+
precision (in which case you might lose perfect roundtripping ability, but
|
2573 |
+
the JSON number will still be re-encoded as a JSON number).
|
2574 |
+
|
2575 |
+
Note that precision is not accuracy - binary floating point values cannot
|
2576 |
+
represent most decimal fractions exactly, and when converting from and to
|
2577 |
+
floating point, C<JSON> only guarantees precision up to but not including
|
2578 |
+
the least significant bit.
|
2579 |
+
|
2580 |
+
When C<allow_bignum> is enabled, the big integers
|
2581 |
+
and the numeric can be optionally converted into L<Math::BigInt> and
|
2582 |
+
L<Math::BigFloat> objects.
|
2583 |
+
|
2584 |
+
=item true, false
|
2585 |
+
|
2586 |
+
These JSON atoms become C<JSON::PP::true> and C<JSON::PP::false>,
|
2587 |
+
respectively. They are overloaded to act almost exactly like the numbers
|
2588 |
+
C<1> and C<0>. You can check whether a scalar is a JSON boolean by using
|
2589 |
+
the C<JSON::is_bool> function.
|
2590 |
+
|
2591 |
+
print JSON::PP::true . "\n";
|
2592 |
+
=> true
|
2593 |
+
print JSON::PP::true + 1;
|
2594 |
+
=> 1
|
2595 |
+
|
2596 |
+
ok(JSON::true eq '1');
|
2597 |
+
ok(JSON::true == 1);
|
2598 |
+
|
2599 |
+
C<JSON> will install these missing overloading features to the backend modules.
|
2600 |
+
|
2601 |
+
|
2602 |
+
=item null
|
2603 |
+
|
2604 |
+
A JSON null atom becomes C<undef> in Perl.
|
2605 |
+
|
2606 |
+
C<JSON::PP::null> returns C<undef>.
|
2607 |
+
|
2608 |
+
=back
|
2609 |
+
|
2610 |
+
|
2611 |
+
=head2 PERL -> JSON
|
2612 |
+
|
2613 |
+
The mapping from Perl to JSON is slightly more difficult, as Perl is a
|
2614 |
+
truly typeless language, so we can only guess which JSON type is meant by
|
2615 |
+
a Perl value.
|
2616 |
+
|
2617 |
+
=over 4
|
2618 |
+
|
2619 |
+
=item hash references
|
2620 |
+
|
2621 |
+
Perl hash references become JSON objects. As there is no inherent ordering
|
2622 |
+
in hash keys (or JSON objects), they will usually be encoded in a
|
2623 |
+
pseudo-random order that can change between runs of the same program but
|
2624 |
+
stays generally the same within a single run of a program. C<JSON>
|
2625 |
+
optionally sorts the hash keys (determined by the I<canonical> flag), so
|
2626 |
+
the same data structure will serialise to the same JSON text (given same
|
2627 |
+
settings and version of JSON::XS), but this incurs a runtime overhead
|
2628 |
+
and is only rarely useful, e.g. when you want to compare some JSON text
|
2629 |
+
against another for equality.
|
2630 |
+
|
2631 |
+
|
2632 |
+
=item array references
|
2633 |
+
|
2634 |
+
Perl array references become JSON arrays.
|
2635 |
+
|
2636 |
+
=item other references
|
2637 |
+
|
2638 |
+
Other unblessed references are generally not allowed and will cause an
|
2639 |
+
exception to be thrown, except for references to the integers C<0> and
|
2640 |
+
C<1>, which get turned into C<false> and C<true> atoms in JSON. You can
|
2641 |
+
also use C<JSON::false> and C<JSON::true> to improve readability.
|
2642 |
+
|
2643 |
+
to_json [\0,JSON::PP::true] # yields [false,true]
|
2644 |
+
|
2645 |
+
=item JSON::PP::true, JSON::PP::false, JSON::PP::null
|
2646 |
+
|
2647 |
+
These special values become JSON true and JSON false values,
|
2648 |
+
respectively. You can also use C<\1> and C<\0> directly if you want.
|
2649 |
+
|
2650 |
+
JSON::PP::null returns C<undef>.
|
2651 |
+
|
2652 |
+
=item blessed objects
|
2653 |
+
|
2654 |
+
Blessed objects are not directly representable in JSON. See the
|
2655 |
+
C<allow_blessed> and C<convert_blessed> methods on various options on
|
2656 |
+
how to deal with this: basically, you can choose between throwing an
|
2657 |
+
exception, encoding the reference as if it weren't blessed, or provide
|
2658 |
+
your own serialiser method.
|
2659 |
+
|
2660 |
+
See L<convert_blessed>.
|
2661 |
+
|
2662 |
+
=item simple scalars
|
2663 |
+
|
2664 |
+
Simple Perl scalars (any scalar that is not a reference) are the most
|
2665 |
+
difficult objects to encode: JSON::XS and JSON::PP will encode undefined scalars as
|
2666 |
+
JSON C<null> values, scalars that have last been used in a string context
|
2667 |
+
before encoding as JSON strings, and anything else as number value:
|
2668 |
+
|
2669 |
+
# dump as number
|
2670 |
+
encode_json [2] # yields [2]
|
2671 |
+
encode_json [-3.0e17] # yields [-3e+17]
|
2672 |
+
my $value = 5; encode_json [$value] # yields [5]
|
2673 |
+
|
2674 |
+
# used as string, so dump as string
|
2675 |
+
print $value;
|
2676 |
+
encode_json [$value] # yields ["5"]
|
2677 |
+
|
2678 |
+
# undef becomes null
|
2679 |
+
encode_json [undef] # yields [null]
|
2680 |
+
|
2681 |
+
You can force the type to be a string by stringifying it:
|
2682 |
+
|
2683 |
+
my $x = 3.1; # some variable containing a number
|
2684 |
+
"$x"; # stringified
|
2685 |
+
$x .= ""; # another, more awkward way to stringify
|
2686 |
+
print $x; # perl does it for you, too, quite often
|
2687 |
+
|
2688 |
+
You can force the type to be a number by numifying it:
|
2689 |
+
|
2690 |
+
my $x = "3"; # some variable containing a string
|
2691 |
+
$x += 0; # numify it, ensuring it will be dumped as a number
|
2692 |
+
$x *= 1; # same thing, the choice is yours.
|
2693 |
+
|
2694 |
+
You can not currently force the type in other, less obscure, ways.
|
2695 |
+
|
2696 |
+
Note that numerical precision has the same meaning as under Perl (so
|
2697 |
+
binary to decimal conversion follows the same rules as in Perl, which
|
2698 |
+
can differ to other languages). Also, your perl interpreter might expose
|
2699 |
+
extensions to the floating point numbers of your platform, such as
|
2700 |
+
infinities or NaN's - these cannot be represented in JSON, and it is an
|
2701 |
+
error to pass those in.
|
2702 |
+
|
2703 |
+
=item Big Number
|
2704 |
+
|
2705 |
+
When C<allow_bignum> is enabled,
|
2706 |
+
C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
|
2707 |
+
objects into JSON numbers.
|
2708 |
+
|
2709 |
+
|
2710 |
+
=back
|
2711 |
+
|
2712 |
+
=head1 UNICODE HANDLING ON PERLS
|
2713 |
+
|
2714 |
+
If you are not familiar with Unicode handling in Perl,
|
2715 |
+
please check L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>.
|
2716 |
+
|
2717 |
+
=head2 Perl 5.8 and later
|
2718 |
+
|
2719 |
+
Perl can handle Unicode and the JSON::PP de/encode methods also work properly.
|
2720 |
+
|
2721 |
+
$json->allow_nonref->encode(chr hex 3042);
|
2722 |
+
$json->allow_nonref->encode(chr hex 12345);
|
2723 |
+
|
2724 |
+
Returns C<"\u3042"> and C<"\ud808\udf45"> respectively.
|
2725 |
+
|
2726 |
+
$json->allow_nonref->decode('"\u3042"');
|
2727 |
+
$json->allow_nonref->decode('"\ud808\udf45"');
|
2728 |
+
|
2729 |
+
Returns UTF-8 encoded strings with UTF8 flag, regarded as C<U+3042> and C<U+12345>.
|
2730 |
+
|
2731 |
+
Note that in Perl versions 5.8.0 to 5.8.2, the built-in C<join> was broken,
|
2732 |
+
so JSON::PP wraps the C<join> with a subroutine. Thus JSON::PP runs slowly on those versions.
|
2733 |
+
|
2734 |
+
|
2735 |
+
=head2 Perl 5.6
|
2736 |
+
|
2737 |
+
Perl can handle Unicode and the JSON::PP de/encode methods also work.
|
2738 |
+
|
2739 |
+
=head2 Perl 5.005
|
2740 |
+
|
2741 |
+
Perl 5.005 is a byte semantics world -- all strings are sequences of bytes.
|
2742 |
+
That means Unicode handling is not available.
|
2743 |
+
|
2744 |
+
In encoding,
|
2745 |
+
|
2746 |
+
$json->allow_nonref->encode(chr hex 3042); # hex 3042 is 12354.
|
2747 |
+
$json->allow_nonref->encode(chr hex 12345); # hex 12345 is 74565.
|
2748 |
+
|
2749 |
+
Returns C<B> and C<E>: when C<chr> is given a value greater than 255, it treats it
|
2750 |
+
as C<$value % 256>, so the above code is equivalent to:
|
2751 |
+
|
2752 |
+
$json->allow_nonref->encode(chr 66);
|
2753 |
+
$json->allow_nonref->encode(chr 69);
|
2754 |
+
|
2755 |
+
In decoding,
|
2756 |
+
|
2757 |
+
$json->decode('"\u00e3\u0081\u0082"');
|
2758 |
+
|
2759 |
+
The returned value is a byte sequence C<0xE3 0x81 0x82> for UTF-8 encoded
|
2760 |
+
Japanese character (C<HIRAGANA LETTER A>).
|
2761 |
+
And if it is represented in Unicode code point, C<U+3042>.
|
2762 |
+
|
2763 |
+
Next,
|
2764 |
+
|
2765 |
+
$json->decode('"\u3042"');
|
2766 |
+
|
2767 |
+
We would ordinarily expect the returned value to be the Unicode character C<U+3042>.
|
2768 |
+
But this is the 5.005 world, so the result is the byte sequence C<0xE3 0x81 0x82>.
|
2769 |
+
|
2770 |
+
$json->decode('"\ud808\udf45"');
|
2771 |
+
|
2772 |
+
This is not a character C<U+12345> but bytes - C<0xf0 0x92 0x8d 0x85>.
|
2773 |
+
|
2774 |
+
|
2775 |
+
=head1 TODO
|
2776 |
+
|
2777 |
+
=over
|
2778 |
+
|
2779 |
+
=item speed
|
2780 |
+
|
2781 |
+
=item memory saving
|
2782 |
+
|
2783 |
+
=back
|
2784 |
+
|
2785 |
+
|
2786 |
+
=head1 SEE ALSO
|
2787 |
+
|
2788 |
+
Most of this document is copied and modified from the JSON::XS doc.
|
2789 |
+
|
2790 |
+
L<JSON::XS>
|
2791 |
+
|
2792 |
+
RFC4627 (L<http://www.ietf.org/rfc/rfc4627.txt>)
|
2793 |
+
|
2794 |
+
=head1 AUTHOR
|
2795 |
+
|
2796 |
+
Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
|
2797 |
+
|
2798 |
+
|
2799 |
+
=head1 COPYRIGHT AND LICENSE
|
2800 |
+
|
2801 |
+
Copyright 2007-2012 by Makamaka Hannyaharamitu
|
2802 |
+
|
2803 |
+
This library is free software; you can redistribute it and/or modify
|
2804 |
+
it under the same terms as Perl itself.
|
2805 |
+
|
2806 |
+
=cut
|
uroman/lib/JSON/backportPP/Boolean.pm
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
=head1 NAME
|
2 |
+
|
3 |
+
JSON::PP::Boolean - dummy module providing JSON::PP::Boolean
|
4 |
+
|
5 |
+
=head1 SYNOPSIS
|
6 |
+
|
7 |
+
# do not "use" yourself
|
8 |
+
|
9 |
+
=head1 DESCRIPTION
|
10 |
+
|
11 |
+
This module exists only to provide overload resolution for Storable
|
12 |
+
and similar modules. See L<JSON::PP> for more info about this class.
|
13 |
+
|
14 |
+
=cut
|
15 |
+
|
16 |
+
use JSON::backportPP ();
|
17 |
+
use strict;
|
18 |
+
|
19 |
+
1;
|
20 |
+
|
21 |
+
=head1 AUTHOR
|
22 |
+
|
23 |
+
This idea is from L<JSON::XS::Boolean> written by
|
24 |
+
Marc Lehmann <schmorp[at]schmorp.de>
|
25 |
+
|
26 |
+
=cut
|
27 |
+
|
uroman/lib/JSON/backportPP/Compat5005.pm
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
package # This is JSON::backportPP
|
2 |
+
JSON::backportPP5005;
|
3 |
+
|
4 |
+
use 5.005;
|
5 |
+
use strict;
|
6 |
+
|
7 |
+
my @properties;
|
8 |
+
|
9 |
+
$JSON::PP5005::VERSION = '1.10';
|
10 |
+
|
11 |
+
# Compile-time setup for Perl 5.005: installs utf8:: stubs, points the
# JSON::PP encode/decode hooks at this package's helpers, defines three
# B::SVp_* flag constants and records a dummy entry for bytes.pm in %INC.
BEGIN {

    # dummy entry for the bytes pragma (presumably so that a later
    # "use bytes" finds it already loaded -- confirm against the loader).
    $INC{'bytes.pm'} = 1;

    # The UTF8 flag is considered to be off on Perl 5.005.
    sub utf8::is_utf8 {
        return 0;
    }

    # The following three stubs do nothing and return nothing.
    sub utf8::upgrade {
        return;
    }

    sub utf8::encode {
        return;
    }

    sub utf8::decode {
        return;
    }

    # Must always return true.
    sub utf8::downgrade {
        return 1;
    }

    # Hook JSON::PP's encoder/decoder entry points up to the helpers
    # declared in this package.
    *JSON::PP::JSON_PP_encode_ascii      = \&_encode_ascii;
    *JSON::PP::JSON_PP_encode_latin1     = \&_encode_latin1;
    *JSON::PP::JSON_PP_decode_surrogates = \&_decode_surrogates;
    *JSON::PP::JSON_PP_decode_unicode    = \&_decode_unicode;

    # Flag constants that are missing in the B module.  The bodies are
    # kept as bare constant expressions with an empty prototype.
    sub B::SVp_IOK () { 0x01000000; }
    sub B::SVp_NOK () { 0x02000000; }
    sub B::SVp_POK () { 0x04000000; }
}
|
42 |
+
|
43 |
+
|
44 |
+
|
45 |
+
# Return the byte string in $_[0] with every byte above 127 replaced by
# a \uXXXX escape of that byte's value; bytes up to 127 are copied as-is.
sub _encode_ascii {
    my @pieces;
    for my $byte ( unpack('C*', $_[0]) ) {
        push @pieces, $byte <= 127 ? chr($byte) : sprintf('\u%04x', $byte);
    }
    return join('', @pieces);
}
|
48 |
+
|
49 |
+
|
50 |
+
# Split $_[0] into its byte values and glue the corresponding characters
# back together; no escaping is applied.
sub _encode_latin1 {
    my @byte_values = unpack('C*', $_[0]);
    return join('', map( chr($_), @byte_values ));
}
|
53 |
+
|
54 |
+
|
55 |
+
# Combine a UTF-16 surrogate pair, given as two hex strings (high, low),
# into the four-byte UTF-8 encoding of the resulting code point.
# Croaks with "Invalid surrogate pair" when the combined value does not
# fit the 21-bit layout checked below.
# Bit layout taken from http://homepage1.nifty.com/nomenclator/unicode/ucs_utf.htm
sub _decode_surrogates {
    my ($high, $low) = ( hex($_[0]), hex($_[1]) );

    # Surrogate arithmetic as described in perlunicode.
    my $codepoint = 0x10000 + ($high - 0xD800) * 0x400 + ($low - 0xDC00);

    # 32-character string of '0'/'1' for the big-endian code point.
    my $bits = unpack('B32', pack('N', $codepoint));

    # 11 leading zero bits, then groups of 3 + 6 + 6 + 6 payload bits.
    my @groups = $bits =~ /^00000000000(...)(......)(......)(......)$/
        or Carp::croak("Invalid surrogate pair");

    return pack('B*', sprintf('11110%s10%s10%s10%s', @groups));
}
|
67 |
+
|
68 |
+
|
69 |
+
# Turn the four hex digits of a \uXXXX escape into bytes.
#   * 0080-00ff  -> the single byte with that value
#   * otherwise  -> a two- or three-byte UTF-8 sequence built from the bits
# Croaks with "Invalid escaped unicode" when the bit string matches
# neither layout (e.g. the input is not exactly four hex digits).
#
# NOTE(review): values below 0080 also match the two-byte layout and would
# come out as a two-byte sequence rather than one ASCII byte -- confirm
# that callers never pass them here.
sub _decode_unicode {
    my ($hex4) = @_;

    # 0x80-0xff: hand back the raw byte.
    return pack('H2', $1) if $hex4 =~ /^00([89a-f][0-9a-f])$/i;

    my $bits = unpack("B*", pack("H*", $hex4));
    my @groups;

    # Five leading zero bits: 11 payload bits, two-byte form.
    if ( @groups = $bits =~ /^00000(.....)(......)$/ ) {
        return pack('B*', sprintf('110%s10%s', @groups));
    }

    # Any other 16-bit value: three-byte form.
    if ( @groups = $bits =~ /^(....)(......)(......)$/ ) {
        return pack('B*', sprintf('1110%s10%s10%s', @groups));
    }

    Carp::croak("Invalid escaped unicode");
}
|
91 |
+
|
92 |
+
|
93 |
+
# Non-lvalue accessor for the incremental parser's buffered text.
# Called with one extra argument it stores that value first; in every
# case it returns the stored text.  The parser object is created on
# first use.  Croaks when the parser's incr_parsing flag is set.
sub JSON::PP::incr_text {
    my ($self, @new_text) = @_;

    my $parser = ( $self->{_incr_parser} ||= JSON::PP::IncrParser->new );

    Carp::croak("incr_text can not be called when the incremental parser already started parsing")
        if $parser->{incr_parsing};

    $parser->{incr_text} = $new_text[0] if @new_text;

    return $parser->{incr_text};
}
|
103 |
+
|
104 |
+
|
105 |
+
1;
|
106 |
+
__END__
|
107 |
+
|
108 |
+
=pod
|
109 |
+
|
110 |
+
=head1 NAME
|
111 |
+
|
112 |
+
JSON::PP5005 - Helper module for using JSON::PP with Perl 5.005
|
113 |
+
|
114 |
+
=head1 DESCRIPTION
|
115 |
+
|
116 |
+
JSON::PP calls this module internally.
|
117 |
+
|
118 |
+
=head1 AUTHOR
|
119 |
+
|
120 |
+
Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
|
121 |
+
|
122 |
+
|
123 |
+
=head1 COPYRIGHT AND LICENSE
|
124 |
+
|
125 |
+
Copyright 2007-2012 by Makamaka Hannyaharamitu
|
126 |
+
|
127 |
+
This library is free software; you can redistribute it and/or modify
|
128 |
+
it under the same terms as Perl itself.
|
129 |
+
|
130 |
+
=cut
|
131 |
+
|
uroman/lib/JSON/backportPP/Compat5006.pm
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
package # This is JSON::backportPP
|
2 |
+
JSON::backportPP56;
|
3 |
+
|
4 |
+
use 5.006;
|
5 |
+
use strict;
|
6 |
+
|
7 |
+
my @properties;
|
8 |
+
|
9 |
+
$JSON::PP56::VERSION = '1.08';
|
10 |
+
|
11 |
+
BEGIN {
|
12 |
+
|
13 |
+
# Report whether $_[0] is UTF8-flagged by comparing its length in
# characters with its length in bytes: a difference means the flag is on.
sub utf8::is_utf8 {
    my $char_count = length $_[0];

    # "use bytes" is lexically scoped, so only this do-block counts bytes.
    my $byte_count = do { use bytes; length $_[0] };

    return $char_count != $byte_count;
}
|
20 |
+
|
21 |
+
|
22 |
+
sub utf8::upgrade { # shim installed under the utf8:: name by this Perl 5.6 helper
|
23 |
+
; # noop: the body is a single empty statement, the argument is left as it is
|
24 |
+
}
|
25 |
+
|
26 |
+
|
27 |
+
# In-place downgrade of $_[0].
# Returns 1 when the string is not UTF8-flagged (nothing to do) or after
# it has been repacked.  When _is_valid_utf8 rejects the string the sub
# croaks, unless the optional second argument is true, in which case it
# returns 0.
sub utf8::downgrade ($;$) {
    utf8::is_utf8( $_[0] ) or return 1;

    unless ( _is_valid_utf8( $_[0] ) ) {
        $_[1] or Carp::croak("Wide character in subroutine entry");
        return 0;
    }

    # Repack code point by code point: "C" below 256, "U" otherwise.
    my $repacked;
    foreach my $code ( unpack( "U*", $_[0] ) ) {
        $repacked .= pack( $code < 256 ? "C" : "U", $code );
    }

    $_[0] = $repacked;
    return 1;
}
|
48 |
+
|
49 |
+
|
50 |
+
# UTF8 flag off: rewrite $_[0] in place via pack/unpack.
# A string that is not UTF8-flagged is first repacked with "U*" from its
# "C*" values; every string is then repacked with "C*".
sub utf8::encode ($) {
    unless ( utf8::is_utf8( $_[0] ) ) {
        $_[0] = pack( "U*", unpack( "C*", $_[0] ) );
    }

    # Final step shared by both cases; its value is the sub's result.
    $_[0] = pack( "C*", unpack( "C*", $_[0] ) );
}
|
59 |
+
|
60 |
+
|
61 |
+
sub utf8::decode ($) { # UTF8 flag on; acts only when the argument passes _is_valid_utf8
|
62 |
+
if ( _is_valid_utf8( $_[0] ) ) {
|
63 |
+
utf8::downgrade( $_[0] ); # first pass the argument through utf8::downgrade
|
64 |
+
$_[0] = pack( "U*", unpack( "U*", $_[0] ) ); # then rebuild it in place from its "U*" values
|
65 |
+
}
|
66 |
+
}
|
67 |
+
|
68 |
+
|
69 |
+
*JSON::PP::JSON_PP_encode_ascii = \&_encode_ascii;
|
70 |
+
*JSON::PP::JSON_PP_encode_latin1 = \&_encode_latin1;
|
71 |
+
*JSON::PP::JSON_PP_decode_surrogates = \&JSON::PP::_decode_surrogates;
|
72 |
+
*JSON::PP::JSON_PP_decode_unicode = \&JSON::PP::_decode_unicode;
|
73 |
+
|
74 |
+
unless ( defined &B::SVp_NOK ) { # missing in B module.
|
75 |
+
eval q{ sub B::SVp_NOK () { 0x02000000; } };
|
76 |
+
}
|
77 |
+
|
78 |
+
}
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
# ASCII-safe rendering of $_[0]: values up to 127 are copied, values up
# to 65535 become one \uXXXX escape, and larger values become two escapes
# built from JSON::PP::_encode_surrogates.
sub _encode_ascii {
    my @pieces;

    for my $code ( _unpack_emu($_[0]) ) {
        if ( $code <= 127 ) {
            push @pieces, chr($code);
        }
        elsif ( $code <= 65535 ) {
            push @pieces, sprintf('\u%04x', $code);
        }
        else {
            push @pieces, sprintf('\u%x\u%x', JSON::PP::_encode_surrogates($code));
        }
    }

    return join('', @pieces);
}
|
92 |
+
|
93 |
+
|
94 |
+
# Latin-1-safe rendering of $_[0]: values up to 255 are copied, values up
# to 65535 become one \uXXXX escape, and larger values become two escapes
# built from JSON::PP::_encode_surrogates.
sub _encode_latin1 {
    my @pieces;

    for my $code ( _unpack_emu($_[0]) ) {
        if ( $code <= 255 ) {
            push @pieces, chr($code);
        }
        elsif ( $code <= 65535 ) {
            push @pieces, sprintf('\u%04x', $code);
        }
        else {
            push @pieces, sprintf('\u%x\u%x', JSON::PP::_encode_surrogates($code));
        }
    }

    return join('', @pieces);
}
|
104 |
+
|
105 |
+
|
106 |
+
# for Perl 5.6 unpack warnings
# Return the numeric values of $_[0]: "U*" values only when the string is
# both UTF8-flagged and accepted by _is_valid_utf8, "C*" values otherwise.
sub _unpack_emu {
    return unpack('C*', $_[0]) unless utf8::is_utf8($_[0]);
    return unpack('U*', $_[0]) if _is_valid_utf8($_[0]);
    return unpack('C*', $_[0]);
}
|
111 |
+
|
112 |
+
|
113 |
+
# Scan a copy of $_[0] token by token, where a token is either one of the
# byte sequences listed in the first alternative or any single character.
# Result:
#   * undef       -- the string is empty
#   * 0           -- the very first token is a stray character
#   * 1           -- every token matches one of the listed sequences
#   * bare return -- a stray character follows a matching first token
sub _is_valid_utf8 {
    my $text = $_[0];
    my $verdict;    # stays undef until the first token has been seen

    while ($text =~ /(?:
          (
             [\x00-\x7F]
            |[\xC2-\xDF][\x80-\xBF]
            |[\xE0][\xA0-\xBF][\x80-\xBF]
            |[\xE1-\xEC][\x80-\xBF][\x80-\xBF]
            |[\xED][\x80-\x9F][\x80-\xBF]
            |[\xEE-\xEF][\x80-\xBF][\x80-\xBF]
            |[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]
            |[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]
            |[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]
          )
        | (.)
        )/xg)
    {
        my $well_formed = defined $1;

        # The first token decides the provisional verdict.
        $verdict = $well_formed ? 1 : 0 unless defined $verdict;

        # eventually, not utf8
        return if !$well_formed && $verdict;
    }

    return $verdict;
}
|
145 |
+
|
146 |
+
|
147 |
+
1;
|
148 |
+
__END__
|
149 |
+
|
150 |
+
=pod
|
151 |
+
|
152 |
+
=head1 NAME
|
153 |
+
|
154 |
+
JSON::PP56 - Helper module for using JSON::PP with Perl 5.6
|
155 |
+
|
156 |
+
=head1 DESCRIPTION
|
157 |
+
|
158 |
+
JSON::PP calls this module internally.
|
159 |
+
|
160 |
+
=head1 AUTHOR
|
161 |
+
|
162 |
+
Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
|
163 |
+
|
164 |
+
|
165 |
+
=head1 COPYRIGHT AND LICENSE
|
166 |
+
|
167 |
+
Copyright 2007-2012 by Makamaka Hannyaharamitu
|
168 |
+
|
169 |
+
This library is free software; you can redistribute it and/or modify
|
170 |
+
it under the same terms as Perl itself.
|
171 |
+
|
172 |
+
=cut
|
173 |
+
|
uroman/lib/NLP/Chinese.pm
ADDED
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
################################################################
|
2 |
+
# #
|
3 |
+
# Chinese #
|
4 |
+
# #
|
5 |
+
################################################################
|
6 |
+
|
7 |
+
package NLP::Chinese;

# Helper modules are invoked through class-name strings held in package
# variables (the same convention NLP::Romanizer uses).
use NLP::utilities;

$utf8 = "NLP::UTF8";
# read_chinese_tonal_pinyin_files and tonal_pinyin call $util->trim and
# $util->member. Without this assignment $util is undefined in this package
# and those calls die with "Can't call method ... on an undefined value".
$util = "NLP::utilities";
# Empty hash handed (as a typeglob) to NLP::UTF8::split_into_utf8_characters.
%empty_ht = ();
|
11 |
+
|
12 |
+
sub read_chinese_tonal_pinyin_files {
    # Reads Chinese-to-pinyin resource files into the caller's hash.
    #
    # Arguments:
    #   $caller    - class or object this method was invoked on; also used to
    #                call number_to_accent_tone
    #   *ht        - typeglob of the hash to fill. Sub-tables written, each
    #                keyed by the UTF-8 source string:
    #                  kHanyuPinlu, kXHC1983, kHanyuPinyin, kMandarin (Unihan),
    #                  cedict (first reading seen), cedicts (all distinct
    #                  readings, space separated), simple_pinyin
    #   @filenames - files to read; the format is chosen from the file name
    #                (/unihan/i, then /cedict/i, then /chinese_to_pinyin/i)
    #
    # Files that cannot be opened or whose name is not recognized are reported
    # on STDERR and skipped. Lines starting with "#" are ignored.
    local($caller, *ht, @filenames) = @_;

    # Package-level counters, reset on every call.
    $n_kHanyuPinlu = 0;
    $n_kXHC1983 = 0;
    $n_kHanyuPinyin = 0;
    $n_kMandarin = 0;
    $n_cedict = 0;
    $n_simple_pinyin = 0;

    foreach my $filename (@filenames) {
        my $format = ($filename =~ /unihan/i)            ? "unihan"
                   : ($filename =~ /cedict/i)            ? "cedict"
                   : ($filename =~ /chinese_to_pinyin/i) ? "simple_pinyin"
                   :                                       "";
        unless ($format) {
            print STDERR "Don't know what to do with file $filename (in read_chinese_tonal_pinyin_files)\n";
            next;
        }

        # Three-argument open with a lexical handle: the file name is never
        # interpreted as a mode or pipe, and no global handle is shared with
        # other readers.
        my $in;
        unless (open($in, "<", $filename)) {
            print STDERR "Can't open $filename\n";
            next;
        }

        if ($format eq "unihan") {
            # $util is not assigned anywhere in this package, so fall back to
            # the utilities class name that NLP::Romanizer uses.
            # NOTE(review): assumes NLP::utilities (providing trim) has been
            # loaded by the calling program - confirm.
            my $helpers = $util || "NLP::utilities";
            while (my $line = <$in>) {
                next if $line =~ /^#/;
                $line =~ s/\s*$//;
                # Tab-separated: code point, field type, value
                my ($u, $type, $value) = split(/\t/, $line);
                next unless defined($type)
                         && $type =~ /^(kHanyuPinlu|kXHC1983|kHanyuPinyin|kMandarin)$/;
                $u = $helpers->trim($u);
                $type = $helpers->trim($type);
                $value = $helpers->trim($value);
                my $f = $utf8->unicode_string2string($u);

                if ($type eq "kHanyuPinlu") {
                    # Drop the parenthesized parts, then convert tone digits
                    $value =~ s/\(.*?\)//g;
                    $value = $helpers->trim($value);
                    $ht{"kHanyuPinlu"}->{$f} = $caller->number_to_accent_tone($value);
                    $n_kHanyuPinlu++;
                } elsif ($type eq "kXHC1983") {
                    # Keep only what follows each ":" in the value
                    my @translits = ($value =~ /:(\S+)/g);
                    $ht{"kXHC1983"}->{$f} = join(" ", @translits);
                    $n_kXHC1983++;
                } elsif ($type eq "kHanyuPinyin") {
                    # Strip everything up to the last ":"; commas become spaces
                    $value =~ s/^.*://;
                    $value =~ s/,/ /g;
                    $ht{"kHanyuPinyin"}->{$f} = $value;
                    $n_kHanyuPinyin++;
                } elsif ($type eq "kMandarin") {
                    $ht{"kMandarin"}->{$f} = $value;
                    $n_kMandarin++;
                }
            }
            close($in);
            print "Read in $n_kHanyuPinlu kHanyuPinlu, $n_kXHC1983 n_kXHC1983, $n_kHanyuPinyin n_kHanyuPinyin $n_kMandarin n_kMandarin\n";
        } elsif ($format eq "cedict") {
            while (my $line = <$in>) {
                next if $line =~ /^#/;
                $line =~ s/\s*$//;
                # Second whitespace-separated token is the key; the reading is
                # the text inside the first [...] after it.
                my ($f, $translit) = ($line =~ /^\S+\s+(\S+)\s+\[([^\[\]]+)\]/);
                next unless defined($translit);
                $translit = $utf8->extended_lower_case($translit);
                $translit = $caller->number_to_accent_tone($translit);
                $translit =~ s/\s//g;
                if (my $old_translit = $ht{"cedict"}->{$f}) {
                    # Further readings accumulate in "cedicts" only
                    $ht{"cedicts"}->{$f} = join(" ", $ht{"cedicts"}->{$f}, $translit)
                        unless $old_translit eq $translit;
                } else {
                    $ht{"cedict"}->{$f} = $translit;
                    $ht{"cedicts"}->{$f} = $translit;
                }
                $n_cedict++;
            }
            close($in);
        } else {
            while (my $line = <$in>) {
                next if $line =~ /^#/;
                # Exactly two tab-separated, whitespace-free fields
                if (my ($f, $translit) = ($line =~ /^(\S+)\t(\S+)\s*$/)) {
                    $ht{"simple_pinyin"}->{$f} = $translit;
                    $n_simple_pinyin++;
                }
            }
            close($in);
        }
    }
}
|
112 |
+
|
113 |
+
sub tonal_pinyin {
    # Returns the tonal pinyin for the UTF-8 string $s, caching the result.
    #
    # Arguments:
    #   $caller - class or object this method was invoked on (unused)
    #   $s      - UTF-8 encoded source string
    #   *ht     - typeglob of the hash filled by read_chinese_tonal_pinyin_files
    #   $gloss  - text that is only copied into the conflict log
    #
    # Side effects on %ht:
    #   COMBINED->{$s}     - the returned result (also consulted as a cache)
    #   PINYIN_TITLE->{$s} - per-character list of readings and their sources
    #   CONFLICT->{$s}     - log text, written when the CEDICT reading of a
    #                        multi-character string differs from the
    #                        character-by-character reading
    local($caller, $s, *ht, $gloss) = @_;

    my $cached = $ht{COMBINED}->{$s};
    return $cached if defined($cached);

    my $cedict_pinyin = $ht{"cedict"}->{$s} || "";
    my $cedicts_pinyin = $ht{"cedicts"}->{$s} || "";
    my @characters = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
    my $multi_char_p = (@characters > 1);

    # Character-by-character reading; the first table with an entry wins and
    # only the first of several space-separated readings is used.
    my $unihan_pinyin = "";
    foreach my $c (@characters) {
        my $pinyin;
        if ($pinyin = $ht{"simple_pinyin"}->{$c}) {
            $unihan_pinyin .= $pinyin;
        } elsif ($pinyin = $ht{"kHanyuPinlu"}->{$c}) {
            $pinyin =~ s/^(\S+)\s.*$/$1/;
            $unihan_pinyin .= $pinyin;
        } elsif ($pinyin = $ht{"kXHC1983"}->{$c}) {
            $pinyin =~ s/^(\S+)\s.*$/$1/;
            $unihan_pinyin .= $pinyin;
        } elsif ($pinyin = $ht{"kHanyuPinyin"}->{$c}) {
            $pinyin =~ s/^(\S+)\s.*$/$1/;
            $unihan_pinyin .= $pinyin;
        } elsif ($pinyin = $ht{"cedicts"}->{$c}) {
            $pinyin =~ s/^(\S+)\s.*$/$1/;
            $unihan_pinyin .= $pinyin;
        # middle dot, katakana middle dot, multiplication sign
        } elsif ($c =~ /^(\xC2\xB7|\xE3\x83\xBB|\xC3\x97)$/) {
            $unihan_pinyin .= $c;
        # ASCII
        } elsif ($c =~ /^([\x21-\x7E])$/) {
            $unihan_pinyin .= $c;
        } else {
            $unihan_pinyin .= "?";
        }
    }

    # Title text: whole-string CEDICT readings first, then, per character,
    # every distinct reading with the sources that list it.
    my $pinyin_title = "";
    if ($multi_char_p && $cedicts_pinyin) {
        foreach my $pinyin (split(/\s+/, $cedicts_pinyin)) {
            $pinyin_title .= "$s $pinyin (CEDICT)\n";
        }
        $pinyin_title .= "\n";
    }
    foreach my $c (@characters) {
        my %sources_of = ();
        my @pinyins = ();
        foreach my $type (("kHanyuPinlu", "kXHC1983", "kHanyuPinyin", "cedicts")) {
            my $pinyin_s = $ht{$type}->{$c};
            next unless $pinyin_s;
            my $type2 = ($type eq "cedicts") ? "CEDICT" : $type;
            foreach my $pinyin (split(/\s+/, $pinyin_s)) {
                # Every reading pushed onto @pinyins also gets a %sources_of
                # entry, so the hash doubles as the "already listed" test.
                # This replaces a $util->member call; $util is not defined in
                # this package.
                push(@pinyins, $pinyin) unless exists $sources_of{$pinyin};
                $sources_of{$pinyin} = ($sources_of{$pinyin})
                                     ? join(", ", $sources_of{$pinyin}, $type2)
                                     : $type2;
            }
        }
        foreach my $pinyin (@pinyins) {
            $pinyin_title .= "$c $pinyin ($sources_of{$pinyin})\n";
        }
    }
    $pinyin_title =~ s/\n$//;
    # Encode the remaining line breaks as an HTML character reference. In the
    # text this was taken from, the replacement was a literal newline, which
    # made the substitution a no-op.
    # NOTE(review): presumably the entity was decoded by an HTML rendering of
    # the source - confirm against the upstream file.
    $pinyin_title =~ s/\n/&#xA;/g;

    # A reading made up solely of unknown-character marks is no reading.
    $unihan_pinyin = "" if $unihan_pinyin =~ /^\?+$/;

    if ($multi_char_p && $cedict_pinyin && $unihan_pinyin && ($unihan_pinyin ne $cedict_pinyin)) {
        my $log = "Gloss($s): $gloss\nCEdict($s): $cedicts_pinyin\nUnihan($s): $unihan_pinyin\n";
        foreach my $type (("kHanyuPinlu", "kXHC1983", "kHanyuPinyin")) {
            my $log_line = "$type($s): ";
            foreach my $c (@characters) {
                my $pinyin = $ht{$type}->{$c} || "";
                if ($pinyin =~ / /) {
                    $log_line .= "($pinyin)";   # several readings
                } elsif ($pinyin) {
                    $log_line .= $pinyin;
                } else {
                    $log_line .= "?";
                }
            }
            $log .= "$log_line\n";
        }
        $ht{CONFLICT}->{$s} = $log;
    }

    # The whole-string CEDICT reading takes precedence for multi-character
    # strings; otherwise the character-by-character reading is preferred.
    my $result = $unihan_pinyin || $cedict_pinyin;
    $result = $cedict_pinyin if $multi_char_p && $cedict_pinyin;
    $ht{COMBINED}->{$s} = $result;
    $ht{PINYIN_TITLE}->{$s} = $pinyin_title;
    return $result;
}
|
201 |
+
|
202 |
+
# Maps a vowel followed by a tone digit (1-4) to the UTF-8 bytes of the
# accented vowel. The u-umlaut rows accept both the "u:" spelling and the
# UTF-8 encoded letter as input.
%number_to_accent_tone_ht = (
    # a
    "a1" => "\xC4\x81", "a2" => "\xC3\xA1", "a3" => "\xC7\x8E", "a4" => "\xC3\xA0",
    # e
    "e1" => "\xC4\x93", "e2" => "\xC3\xA9", "e3" => "\xC4\x9B", "e4" => "\xC3\xA8",
    # i
    "i1" => "\xC4\xAB", "i2" => "\xC3\xAD", "i3" => "\xC7\x90", "i4" => "\xC3\xAC",
    # o
    "o1" => "\xC5\x8D", "o2" => "\xC3\xB3", "o3" => "\xC7\x92", "o4" => "\xC3\xB2",
    # u
    "u1" => "\xC5\xAB", "u2" => "\xC3\xBA", "u3" => "\xC7\x94", "u4" => "\xC3\xB9",
    # u-umlaut written as "u:"
    "u:1" => "\xC7\x96", "u:2" => "\xC7\x98", "u:3" => "\xC7\x9A", "u:4" => "\xC7\x9C",
    # u-umlaut written as UTF-8 bytes
    "\xC3\xBC1" => "\xC7\x96", "\xC3\xBC2" => "\xC7\x98",
    "\xC3\xBC3" => "\xC7\x9A", "\xC3\xBC4" => "\xC7\x9C"
);
|
211 |
+
|
212 |
+
sub number_to_accent_tone {
    # Converts pinyin with trailing tone digits into accented pinyin.
    #
    # Each run of letters followed by a digit 1-5 is treated as one syllable.
    # Tone 5 just drops the digit. For tones 1-4 the accent goes on the last
    # "a" or "e"; else on the "o" of an "ou"; else on the last
    # i/o/u/u-umlaut that has no vowel after it. The accented form is looked
    # up in %number_to_accent_tone_ht; when there is no entry, or no vowel
    # qualifies, the digit is kept. Finally every remaining "u:" becomes a
    # UTF-8 u-umlaut.
    my ($caller, $s) = @_;

    my $converted = "";
    while ($s =~ /^(.*?)((?:[a-z]|u:|\xC3\xBC)+)([1-5])(.*)$/i) {
        my ($lead, $syllable, $tone, $remainder) = ($1, $2, $3, $4);
        my @split;
        if ($tone eq "5") {
            $converted .= $lead . $syllable;
        } elsif ((@split = ($syllable =~ /^(.*)([ae])(.*)$/))
              || (@split = ($syllable =~ /^(.*)(o)(u.*)$/))
              || (@split = ($syllable =~ /^(.*)(u:|[iou]|\xC3\xBC)([^aeiou]*)$/))) {
            my ($before, $vowel, $after) = @split;
            my $key = $vowel . $tone;
            $converted .= $lead . $before . ($number_to_accent_tone_ht{$key} || $key) . $after;
        } else {
            $converted .= $lead . $syllable . $tone;
        }
        $s = $remainder;
    }
    # Whatever is left carries no tone digit.
    $converted .= $s;
    $converted =~ s/u:/\xC3\xBC/g;
    return $converted;
}
|
232 |
+
|
233 |
+
sub string_contains_utf8_cjk_unified_ideograph_p {
    # Byte-level test: true if $s contains a lead byte E4-E9, or the byte
    # pairs E3 90-BF or F0 A0-AC. Returns the result of the pattern match
    # (the matched bytes in list context).
    my ($caller, $s) = @_;

    my $cjk_lead_bytes = qr/([\xE4-\xE9]|\xE3[\x90-\xBF]|\xF0[\xA0-\xAC])/;
    return ($s =~ $cjk_lead_bytes);
}
|
238 |
+
|
239 |
+
1;
|
uroman/lib/NLP/English.pm
ADDED
The diff for this file is too large to render.
See raw diff
|
|
uroman/lib/NLP/Romanizer.pm
ADDED
@@ -0,0 +1,2020 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
################################################################
|
2 |
+
# #
|
3 |
+
# Romanizer #
|
4 |
+
# #
|
5 |
+
################################################################
|
6 |
+
|
7 |
+
package NLP::Romanizer;

use NLP::Chinese;
use NLP::UTF8;
use NLP::utilities;
use JSON;

# Class names of the helper modules; methods are invoked through these
# package variables (e.g. $utf8->unicode2string(...)).
$utf8      = "NLP::UTF8";
$util      = "NLP::utilities";
$chinesePM = "NLP::Chinese";

my $verbosePM = 0;
# Empty hash handed (as a typeglob) to NLP::UTF8::split_into_utf8_characters.
%empty_ht = ();

# Braille cells, as UTF-8 byte strings.
my $braille_capital_letter_indicator = "\xE2\xA0\xA0";   # U+2820
my $braille_number_indicator         = "\xE2\xA0\xBC";   # U+283C
my $braille_decimal_point            = "\xE2\xA0\xA8";   # U+2828
my $braille_comma                    = "\xE2\xA0\x82";   # U+2802
my $braille_solidus                  = "\xE2\xA0\x8C";   # U+280C
my $braille_numeric_space            = "\xE2\xA0\x90";   # U+2810
my $braille_letter_indicator         = "\xE2\xA0\xB0";   # U+2830
my $braille_period                   = "\xE2\xA0\xB2";   # U+2832
|
28 |
+
|
29 |
+
sub new {
    # Constructor: returns a new, empty hash-based object. $caller may be a
    # class name or an existing object, whose class is then reused.
    my ($caller) = @_;

    my $class = ref($caller) || $caller;
    return bless({}, $class);
}
|
37 |
+
|
38 |
+
sub load_unicode_data {
    # Loads a semicolon-separated Unicode data file into %ht.
    #
    # Arguments:
    #   $this     - object or class this method was invoked on (unused)
    #   *ht       - typeglob of the hash to fill
    #   $filename - path of the data file, e.g. ../../data/UnicodeData.txt
    #
    # Tables written (the "UTF" key is the string returned by
    # $utf8->unicode_hex_string2string for the code point field):
    #   UTF_TO_CHAR_NAME, UTF_TO_CAT, UTF_TO_NUMERIC (non-empty values only),
    #   UTF_NAME_TO_UNICODE, UTF_NAME_TO_CODE
    #
    # A file that cannot be opened is reported on STDERR.
    local($this, *ht, $filename) = @_;

    $n = 0;   # package-level count of entries loaded
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or pipe, and no global handle is shared with the
    # other loaders.
    my $in;
    unless (open($in, "<", $filename)) {
        print STDERR "Can't open $filename\n";
        return;
    }
    while (my $line = <$in>) {
        # Fields used: 0 = code point (hex), 1 = character name,
        # 2 = general category, 8 = numeric value.
        my ($unicode_value, $char_name, $general_category, $numeric_value)
            = (split(/;/, $line))[0, 1, 2, 8];
        # Lines without any ";" (e.g. blank lines) carry no entry.
        next unless defined($char_name);
        my $utf8_code = $utf8->unicode_hex_string2string($unicode_value);
        $ht{UTF_TO_CHAR_NAME}->{$utf8_code} = $char_name;
        $ht{UTF_NAME_TO_UNICODE}->{$char_name} = $unicode_value;
        $ht{UTF_NAME_TO_CODE}->{$char_name} = $utf8_code;
        $ht{UTF_TO_CAT}->{$utf8_code} = $general_category;
        $ht{UTF_TO_NUMERIC}->{$utf8_code} = $numeric_value
            if defined($numeric_value) && ($numeric_value ne "");
        $n++;
    }
    close($in);
    # print STDERR "Loaded $n entries from $filename\n";
}
|
61 |
+
|
62 |
+
sub load_unicode_overwrite_romanization {
    # Loads per-character overrides from a "::slot value" file into %ht.
    #
    # Arguments:
    #   $this     - object or class this method was invoked on (unused)
    #   *ht       - typeglob of the hash to fill
    #   $filename - path of the file, e.g. ../../data/UnicodeDataOverwrite.txt
    #
    # For every non-comment line with a ::u slot, the slots r, num, pic,
    # syllable-info, tone-mark and name are stored, when set, in
    # UTF_TO_CHAR_ROMANIZATION, UTF_TO_NUMERIC, UTF_TO_PICTURE_DESCR,
    # UTF_TO_SYLLABLE_INFO, UTF_TO_TONE_MARK and UTF_TO_CHAR_NAME, keyed by the
    # string $utf8->unicode_hex_string2string returns for the ::u value.
    #
    # A file that cannot be opened is reported on STDERR.
    local($this, *ht, $filename) = @_;

    $n = 0;   # package-level count of entries processed
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or pipe, and no global handle is shared with the
    # other loaders.
    my $in;
    unless (open($in, "<", $filename)) {
        print STDERR "Can't open $filename\n";
        return;
    }
    while (my $line = <$in>) {
        next if $line =~ /^#/;
        # Nothing is stored for a line without a code point, so skip it before
        # reading the other slots or converting the code point.
        my $unicode_value = $util->slot_value_in_double_colon_del_list($line, "u");
        next unless $unicode_value;

        my $romanization = $util->slot_value_in_double_colon_del_list($line, "r");
        my $numeric = $util->slot_value_in_double_colon_del_list($line, "num");
        my $picture = $util->slot_value_in_double_colon_del_list($line, "pic");
        my $syllable_info = $util->slot_value_in_double_colon_del_list($line, "syllable-info");
        my $tone_mark = $util->slot_value_in_double_colon_del_list($line, "tone-mark");
        my $char_name = $util->slot_value_in_double_colon_del_list($line, "name");
        my $utf8_code = $utf8->unicode_hex_string2string($unicode_value);

        # NOTE(review): the truth tests below also skip a value of "0"; only
        # the numeric slot is tested with defined/ne "" - confirm intended.
        $ht{UTF_TO_CHAR_ROMANIZATION}->{$utf8_code} = $romanization if $romanization;
        $ht{UTF_TO_NUMERIC}->{$utf8_code} = $numeric if defined($numeric) && ($numeric ne "");
        $ht{UTF_TO_PICTURE_DESCR}->{$utf8_code} = $picture if $picture;
        $ht{UTF_TO_SYLLABLE_INFO}->{$utf8_code} = $syllable_info if $syllable_info;
        $ht{UTF_TO_TONE_MARK}->{$utf8_code} = $tone_mark if $tone_mark;
        $ht{UTF_TO_CHAR_NAME}->{$utf8_code} = $char_name if $char_name;
        $n++ if $romanization || $numeric || $picture || $syllable_info || $tone_mark;
    }
    close($in);
}
|
95 |
+
|
96 |
+
sub load_script_data {
    # Loads script descriptions from a "::slot value" file into %ht.
    #
    # Arguments:
    #   $this     - object or class this method was invoked on (unused)
    #   *ht       - typeglob of the hash to fill
    #   $filename - path of the file, e.g. ../../data/Scripts.txt
    #
    # Lines without a ::script-name slot are ignored. Tables written:
    #   SCRIPT_P->{name}                      = 1
    #   SCRIPT_NORM->{NAME or alternative}    = name
    #   DIRECTION->{name}                     = ::direction value, if set
    #   SCRIPT_LANGUAGE / LANGUAGE_SCRIPT     = cross index of ::language
    #   SCRIPT_ABUDIGA_DEFAULT_VOWEL->{name}  = set of ::abugida-default-vowel
    #   SCRIPT_FONT->{name}                   = set of ::font-family
    # List-valued slots are split on commas.
    #
    # A file that cannot be opened is reported on STDERR.
    local($this, *ht, $filename) = @_;

    $n = 0;   # package-level count of scripts loaded
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or pipe, and no global handle is shared with the
    # other loaders.
    my $in;
    unless (open($in, "<", $filename)) {
        print STDERR "Can't open $filename\n";
        return;
    }
    while (my $line = <$in>) {
        my $script_name = $util->slot_value_in_double_colon_del_list($line, "script-name");
        next unless $script_name;
        my $abugida_default_vowel_s = $util->slot_value_in_double_colon_del_list($line, "abugida-default-vowel");
        my $alt_script_name_s = $util->slot_value_in_double_colon_del_list($line, "alt-script-name");
        my $language_s = $util->slot_value_in_double_colon_del_list($line, "language");
        my $direction = $util->slot_value_in_double_colon_del_list($line, "direction"); # right-to-left
        my $font_family_s = $util->slot_value_in_double_colon_del_list($line, "font-family");

        $ht{SCRIPT_P}->{$script_name} = 1;
        $ht{SCRIPT_NORM}->{(uc $script_name)} = $script_name;
        $ht{DIRECTION}->{$script_name} = $direction if $direction;
        foreach my $language (split(/,\s*/, $language_s)) {
            $ht{SCRIPT_LANGUAGE}->{$script_name}->{$language} = 1;
            $ht{LANGUAGE_SCRIPT}->{$language}->{$script_name} = 1;
        }
        # Alternative names are registered both as written and upper-cased.
        foreach my $alt_script_name (split(/,\s*/, $alt_script_name_s)) {
            $ht{SCRIPT_NORM}->{$alt_script_name} = $script_name;
            $ht{SCRIPT_NORM}->{(uc $alt_script_name)} = $script_name;
        }
        # The key spelling SCRIPT_ABUDIGA_DEFAULT_VOWEL is kept as is; it is
        # the name the rest of the program would have to look up.
        foreach my $abugida_default_vowel (split(/,\s*/, $abugida_default_vowel_s)) {
            $ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$script_name}->{$abugida_default_vowel} = 1 if $abugida_default_vowel;
        }
        foreach my $font_family (split(/,\s*/, $font_family_s)) {
            $ht{SCRIPT_FONT}->{$script_name}->{$font_family} = 1 if $font_family;
        }
        $n++;
    }
    close($in);
    # print STDERR "Loaded $n entries from $filename\n";
}
|
134 |
+
|
135 |
+
sub unicode_hangul_romanization {
    # Romanizes the characters of $s whose code point lies in
    # U+AC00..U+D7A3 by arithmetic decomposition into lead, vowel and tail.
    # Any other character is copied to the output only if $pass_through_p is
    # true (default 0); otherwise it is dropped.
    my ($this, $s, $pass_through_p) = @_;

    $pass_through_p = 0 unless defined($pass_through_p);
    # Romanization tables indexed by lead / vowel / tail number; "-" stands
    # for an empty slot and is removed from the result. Kept as package
    # variables, as before.
    @leads  = qw(g gg n d dd r m b bb s ss - j jj c k t p h);
    @vowels = qw(a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi i);
    @tails  = qw(- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p h);
    my ($n_vowels, $n_tails) = (21, 28);

    my $romanized = "";
    foreach my $char ($utf8->split_into_utf8_characters($s, "return only chars", *empty_ht)) {
        my $code_point = $utf8->utf8_to_unicode($char);
        if (($code_point < 0xAC00) || ($code_point > 0xD7A3)) {
            $romanized .= $char if $pass_through_p;
            next;
        }
        my $offset = $code_point - 0xAC00;
        my $syllable = join("",
                            $leads[int($offset / ($n_tails * $n_vowels))],
                            $vowels[int($offset / $n_tails) % $n_vowels],
                            $tails[$offset % $n_tails]);
        $syllable =~ s/-//g;
        $romanized .= $syllable;
    }
    return $romanized;
}
|
161 |
+
|
162 |
+
sub listify_comma_sep_string {
    # Splits a comma-separated string into a list of elements. An element is
    # a double-quoted string, a single-quoted string (each may contain its
    # own quote character backslash-escaped) or a run of characters other
    # than quotes, commas and spaces. Every element is passed through
    # $util->dequote_string. A string without any non-whitespace character
    # yields the empty list.
    my ($this, $s) = @_;

    my @elements = ();
    return @elements unless $s =~ /\S/;
    $s = $util->trim2($s);

    # Peel off one leading "element," at a time.
    while ($s =~ /^("(?:\\"|[^"])*"|'(?:\\'|[^'])*'|[^"', ]+),\s*(.*)$/) {
        my ($head, $tail) = ($1, $2);
        push(@elements, $util->dequote_string($head));
        $s = $tail;
    }
    # What remains is the last element (or text the pattern could not split).
    push(@elements, $util->dequote_string($s)) if $s =~ /\S/;

    return @elements;
}
|
178 |
+
|
179 |
+
sub braille_string_p {
    # True if $s consists entirely of one or more three-byte sequences
    # E2 A0-A3 80-BF. Returns the result of the pattern match.
    my ($this, $s) = @_;

    my $braille_only = qr/^(\xE2[\xA0-\xA3][\x80-\xBF])+$/;
    return ($s =~ $braille_only);
}
|
184 |
+
|
185 |
+
sub register_word_boundary_info {
    # Records the word-position restrictions of one source/target mapping.
    # For every flag that is true, an entry with value 1 is written to the
    # table belonging to that flag: the *_LANG_SPEC table (keyed by language
    # code, source, target) when $lang_code is set, otherwise the general
    # table (keyed by source, target).
    local($this, *ht, $lang_code, $utf8_source_string, $utf8_target_string, $use_only_for_whole_word_p,
          $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
          $dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p) = @_;

    # flag, general table, language-specific table
    my @restrictions = (
        [$use_only_for_whole_word_p,   "USE_ONLY_FOR_WHOLE_WORD",   "USE_ONLY_FOR_WHOLE_WORD_LANG_SPEC"],
        [$use_only_at_start_of_word_p, "USE_ONLY_AT_START_OF_WORD", "USE_ONLY_AT_START_OF_WORD_LANG_SPEC"],
        [$use_only_at_end_of_word_p,   "USE_ONLY_AT_END_OF_WORD",   "USE_ONLY_AT_END_OF_WORD_LANG_SPEC"],
        [$dont_use_at_start_of_word_p, "DONT_USE_AT_START_OF_WORD", "DONT_USE_AT_START_OF_WORD_LANG_SPEC"],
        [$dont_use_at_end_of_word_p,   "DONT_USE_AT_END_OF_WORD",   "DONT_USE_AT_END_OF_WORD_LANG_SPEC"],
    );

    foreach my $restriction (@restrictions) {
        my ($flag_p, $general_table, $lang_spec_table) = @$restriction;
        next unless $flag_p;
        if ($lang_code) {
            $ht{$lang_spec_table}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1;
        } else {
            $ht{$general_table}->{$utf8_source_string}->{$utf8_target_string} = 1;
        }
    }
}
|
226 |
+
|
227 |
+
sub load_romanization_table {
|
228 |
+
local($this, *ht, $filename) = @_;
|
229 |
+
# ../../data/romanization-table.txt
|
230 |
+
|
231 |
+
$n = 0;
|
232 |
+
$line_number = 0;
|
233 |
+
if (open(IN, $filename)) {
|
234 |
+
while (<IN>) {
|
235 |
+
$line_number++;
|
236 |
+
next if /^#/;
|
237 |
+
if ($_ =~ /^::preserve\s/) {
|
238 |
+
$from_unicode = $util->slot_value_in_double_colon_del_list($_, "from");
|
239 |
+
$to_unicode = $util->slot_value_in_double_colon_del_list($_, "to");
|
240 |
+
if ($from_unicode =~ /^(?:U\+|\\u)[0-9A-F]{4,}$/i) {
|
241 |
+
$from_unicode =~ s/^(?:U\+|\\u)//;
|
242 |
+
$from_code_point = hex($from_unicode);
|
243 |
+
} else {
|
244 |
+
$from_code_point = "";
|
245 |
+
}
|
246 |
+
if ($to_unicode =~ /^(?:U\+|\\u)[0-9A-F]{4,}$/i) {
|
247 |
+
$to_unicode =~ s/^(?:U\+|\\u)//;
|
248 |
+
$to_code_point = hex($to_unicode);
|
249 |
+
} else {
|
250 |
+
$to_code_point = $from_code_point;
|
251 |
+
}
|
252 |
+
if ($from_code_point ne "") {
|
253 |
+
# print STDERR "Preserve code-points $from_unicode--$to_unicode = $from_code_point--$to_code_point\n";
|
254 |
+
foreach $code_point (($from_code_point .. $to_code_point)) {
|
255 |
+
$utf8_string = $utf8->unicode2string($code_point);
|
256 |
+
$ht{UTF_CHAR_MAPPING}->{$utf8_string}->{$utf8_string} = 1;
|
257 |
+
}
|
258 |
+
$n++;
|
259 |
+
}
|
260 |
+
next;
|
261 |
+
}
|
262 |
+
$utf8_source_string = $util->slot_value_in_double_colon_del_list($_, "s");
|
263 |
+
$utf8_target_string = $util->slot_value_in_double_colon_del_list($_, "t");
|
264 |
+
$utf8_alt_target_string_s = $util->slot_value_in_double_colon_del_list($_, "t-alt");
|
265 |
+
$use_alt_in_pointed_p = ($_ =~ /::use-alt-in-pointed\b/);
|
266 |
+
$use_only_for_whole_word_p = ($_ =~ /::use-only-for-whole-word\b/);
|
267 |
+
$use_only_at_start_of_word_p = ($_ =~ /::use-only-at-start-of-word\b/);
|
268 |
+
$use_only_at_end_of_word_p = ($_ =~ /::use-only-at-end-of-word\b/);
|
269 |
+
$dont_use_at_start_of_word_p = ($_ =~ /::dont-use-at-start-of-word\b/);
|
270 |
+
$dont_use_at_end_of_word_p = ($_ =~ /::dont-use-at-end-of-word\b/);
|
271 |
+
$use_only_in_lower_case_enviroment_p = ($_ =~ /::use-only-in-lower-case-enviroment\b/);
|
272 |
+
$word_external_punctuation_p = ($_ =~ /::word-external-punctuation\b/);
|
273 |
+
$utf8_source_string =~ s/\s*$//;
|
274 |
+
$utf8_target_string =~ s/\s*$//;
|
275 |
+
$utf8_alt_target_string_s =~ s/\s*$//;
|
276 |
+
$utf8_target_string =~ s/^"(.*)"$/$1/;
|
277 |
+
$utf8_target_string =~ s/^'(.*)'$/$1/;
|
278 |
+
@utf8_alt_targets = $this->listify_comma_sep_string($utf8_alt_target_string_s);
|
279 |
+
$numeric = $util->slot_value_in_double_colon_del_list($_, "num");
|
280 |
+
$numeric =~ s/\s*$//;
|
281 |
+
$annotation = $util->slot_value_in_double_colon_del_list($_, "annotation");
|
282 |
+
$annotation =~ s/\s*$//;
|
283 |
+
$lang_code = $util->slot_value_in_double_colon_del_list($_, "lcode");
|
284 |
+
$prob = $util->slot_value_in_double_colon_del_list($_, "p") || 1;
|
285 |
+
unless (($utf8_target_string eq "") && ($numeric =~ /\d/)) {
|
286 |
+
if ($lang_code) {
|
287 |
+
$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = $prob;
|
288 |
+
} else {
|
289 |
+
$ht{UTF_CHAR_MAPPING}->{$utf8_source_string}->{$utf8_target_string} = $prob;
|
290 |
+
}
|
291 |
+
if ($word_external_punctuation_p) {
|
292 |
+
if ($lang_code) {
|
293 |
+
$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = $prob;
|
294 |
+
} else {
|
295 |
+
$ht{WORD_EXTERNAL_PUNCTUATION}->{$utf8_source_string}->{$utf8_target_string} = $prob;
|
296 |
+
}
|
297 |
+
}
|
298 |
+
if ($this->braille_string_p($utf8_source_string)) {
|
299 |
+
if (($utf8_target_string =~ /^[a-z]+$/)
|
300 |
+
&& (! ($utf8_source_string =~ /^$braille_capital_letter_indicator/))) {
|
301 |
+
my $uc_utf8_source_string = "$braille_capital_letter_indicator$utf8_source_string";
|
302 |
+
my $uc_utf8_target_string = ucfirst $utf8_target_string;
|
303 |
+
if ($lang_code) {
|
304 |
+
$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$uc_utf8_source_string}->{$uc_utf8_target_string} = $prob;
|
305 |
+
} else {
|
306 |
+
$ht{UTF_CHAR_MAPPING}->{$uc_utf8_source_string}->{$uc_utf8_target_string} = $prob;
|
307 |
+
}
|
308 |
+
$this->register_word_boundary_info(*ht, $lang_code, $uc_utf8_source_string, $uc_utf8_target_string,
|
309 |
+
$use_only_for_whole_word_p, $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
|
310 |
+
$dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p);
|
311 |
+
}
|
312 |
+
if (($utf8_target_string =~ /^[0-9]$/)
|
313 |
+
&& ($utf8_source_string =~ /^$braille_number_indicator./)) {
|
314 |
+
my $core_number_char = $utf8_source_string;
|
315 |
+
$core_number_char =~ s/$braille_number_indicator//;
|
316 |
+
$ht{BRAILLE_TO_DIGIT}->{$core_number_char} = $utf8_target_string;
|
317 |
+
}
|
318 |
+
}
|
319 |
+
}
|
320 |
+
if ($use_only_in_lower_case_enviroment_p) {
|
321 |
+
if ($lang_code) {
|
322 |
+
$ht{USE_ONLY_IN_LOWER_CASE_ENVIROMENT_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1;
|
323 |
+
} else {
|
324 |
+
$ht{USE_ONLY_IN_LOWER_CASE_ENVIROMENT}->{$utf8_source_string}->{$utf8_target_string} = 1;
|
325 |
+
}
|
326 |
+
}
|
327 |
+
$this->register_word_boundary_info(*ht, $lang_code, $utf8_source_string, $utf8_target_string,
|
328 |
+
$use_only_for_whole_word_p, $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
|
329 |
+
$dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p);
|
330 |
+
foreach $utf8_alt_target (@utf8_alt_targets) {
|
331 |
+
if ($lang_code) {
|
332 |
+
$ht{UTF_CHAR_ALT_MAPPING_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = $prob;
|
333 |
+
$ht{USE_ALT_IN_POINTED_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1 if $use_alt_in_pointed_p;
|
334 |
+
} else {
|
335 |
+
$ht{UTF_CHAR_ALT_MAPPING}->{$utf8_source_string}->{$utf8_alt_target} = $prob;
|
336 |
+
$ht{USE_ALT_IN_POINTED}->{$utf8_source_string}->{$utf8_alt_target} = 1 if $use_alt_in_pointed_p;
|
337 |
+
}
|
338 |
+
if ($use_only_for_whole_word_p) {
|
339 |
+
if ($lang_code) {
|
340 |
+
$ht{USE_ALT_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
341 |
+
} else {
|
342 |
+
$ht{USE_ALT_ONLY_FOR_WHOLE_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
343 |
+
}
|
344 |
+
}
|
345 |
+
if ($use_only_at_start_of_word_p) {
|
346 |
+
if ($lang_code) {
|
347 |
+
$ht{USE_ALT_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
348 |
+
} else {
|
349 |
+
$ht{USE_ALT_ONLY_AT_START_OF_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
350 |
+
}
|
351 |
+
}
|
352 |
+
if ($use_only_at_end_of_word_p) {
|
353 |
+
if ($lang_code) {
|
354 |
+
$ht{USE_ALT_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
355 |
+
} else {
|
356 |
+
$ht{USE_ALT_ONLY_AT_END_OF_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
357 |
+
}
|
358 |
+
}
|
359 |
+
}
|
360 |
+
if ($numeric =~ /\d/) {
|
361 |
+
$ht{UTF_TO_NUMERIC}->{$utf8_source_string} = $numeric;
|
362 |
+
}
|
363 |
+
if ($annotation =~ /\S/) {
|
364 |
+
$ht{UTF_ANNOTATION}->{$utf8_source_string} = $annotation;
|
365 |
+
}
|
366 |
+
$n++;
|
367 |
+
}
|
368 |
+
close(IN);
|
369 |
+
# print STDERR "Loaded $n entries from $filename\n";
|
370 |
+
} else {
|
371 |
+
print STDERR "Can't open $filename\n";
|
372 |
+
}
|
373 |
+
}
|
374 |
+
|
375 |
+
# Derive a script name from a character name.
#
# Arguments: $char_name  character name to analyse
#            *ht         glob of the resource hash; reads $ht{SCRIPT_NORM},
#                        reads and writes $ht{CHAR_NAME_TO_SCRIPT}
# Returns:   the value found in $ht{SCRIPT_NORM}, or "" if no prefix of the
#            name is a key of that table.
#
# The name is first cut at the first whitespace-preceded keyword
# CONSONANT/LETTER/LIGATURE/SIGN/SYLLABLE/SYLLABICS/VOWEL; after that,
# trailing words are dropped one at a time until the upper-cased remainder
# is found in $ht{SCRIPT_NORM}.
sub char_name_to_script {
   local($this, $char_name, *ht) = @_;

   # Only non-empty cached results are served; an empty result is recomputed,
   # so a script table that is filled in later still takes effect.
   my $cached_script = $ht{CHAR_NAME_TO_SCRIPT}->{$char_name};
   return $cached_script if $cached_script;

   # Remember the lookup key before $char_name is shortened below.
   my $orig_char_name = $char_name;
   $char_name =~ s/\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL)\b.*$//;
   my $script_name;
   while ($char_name) {
      last if $script_name = $ht{SCRIPT_NORM}->{(uc $char_name)};
      # No match yet: drop the last word and try the shorter prefix.
      $char_name =~ s/\s*\S+\s*$//;
   }
   $script_name = "" unless defined($script_name);
   # Store under the name the caller asked about (not the shortened prefix),
   # so that the lookup at the top of this sub can actually hit.
   $ht{CHAR_NAME_TO_SCRIPT}->{$orig_char_name} = $script_name;
   return $script_name;
}
|
390 |
+
|
391 |
+
# Predicate: does the character name contain one of the keywords listed in
# the pattern below (LETTER, VOWEL SIGN, SIGN VIRAMA, HEBREW POINT, ...)?
#
# Arguments: $char_name  character name to test
# Returns:   1 or 0
#
# %ht is not a parameter here: the sub reads and writes
# $ht{CHAR_NAME_LETTER_PLUS} of whatever %ht is in scope when it is called.
sub letter_plus_char_p {
   local($this, $char_name) = @_;

   # Test with defined() so that cached negative (0) answers are reused too.
   my $cached_answer = $ht{CHAR_NAME_LETTER_PLUS}->{$char_name};
   return $cached_answer if defined($cached_answer);

   my $letter_plus_p = ($char_name =~ /\b(?:LETTER|VOWEL SIGN|AU LENGTH MARK|CONSONANT SIGN|SIGN VIRAMA|SIGN PAMAAEH|SIGN COENG|SIGN AL-LAKUNA|SIGN ASAT|SIGN ANUSVARA|SIGN ANUSVARAYA|SIGN BINDI|TIPPI|SIGN NIKAHIT|SIGN CANDRABINDU|SIGN VISARGA|SIGN REAHMUK|SIGN NUKTA|SIGN DOT BELOW|HEBREW POINT)\b/) ? 1 : 0;
   $ht{CHAR_NAME_LETTER_PLUS}->{$char_name} = $letter_plus_p;
   return $letter_plus_p;
}
|
399 |
+
|
400 |
+
# Predicate: does the character name contain one of the keywords listed in
# the pattern below (SUBJOINED LETTER, VOWEL SIGN, EMPHASIS MARK, the listed
# HEBREW and ARABIC marks, ...)?
#
# Arguments: $char_name  character name to test
# Returns:   1 or 0
#
# %ht is not a parameter here: the sub reads and writes
# $ht{CHAR_NAME_SUBJOINED} of whatever %ht is in scope when it is called.
sub subjoined_char_p {
   local($this, $char_name) = @_;

   # Test with defined() so that cached negative (0) answers are reused too.
   my $cached_answer = $ht{CHAR_NAME_SUBJOINED}->{$char_name};
   return $cached_answer if defined($cached_answer);

   my $subjoined_p = (($char_name =~ /\b(?:SUBJOINED LETTER|VOWEL SIGN|AU LENGTH MARK|EMPHASIS MARK|CONSONANT SIGN|SIGN VIRAMA|SIGN PAMAAEH|SIGN COENG|SIGN ASAT|SIGN ANUSVARA|SIGN ANUSVARAYA|SIGN BINDI|TIPPI|SIGN NIKAHIT|SIGN CANDRABINDU|SIGN VISARGA|SIGN REAHMUK|SIGN DOT BELOW|HEBREW (POINT|PUNCTUATION GERESH)|ARABIC (?:DAMMA|DAMMATAN|FATHA|FATHATAN|HAMZA|KASRA|KASRATAN|MADDAH|SHADDA|SUKUN))\b/)) ? 1 : 0;
   $ht{CHAR_NAME_SUBJOINED}->{$char_name} = $subjoined_p;
   return $subjoined_p;
}
|
408 |
+
|
409 |
+
# Allocate the next node id of a chart.
#
# Arguments: *chart_ht  glob of the chart hash
# Returns:   the incremented value of $chart_ht{N_NODES}, which is also
#            stored back into the chart.
sub new_node_id {
   local($this, *chart_ht) = @_;

   # Bump the counter in place; the bumped value is the new id.
   return ++$chart_ht{N_NODES};
}
|
417 |
+
|
418 |
+
# Create a chart node for the span $start..$end and index it.
#
# Arguments: $s         stored as the node's NODE_ROMAN value
#            $start     span start, stored as NODE_START
#            $end       span end, stored as NODE_END
#            *chart_ht  glob of the chart hash to update
#            $type      stored as NODE_TYPE
#            $comment   stored as NODE_COMMENT
# Returns:   the id of the new node (obtained from new_node_id).
sub add_node {
   local($this, $s, $start, $end, *chart_ht, $type, $comment) = @_;

   my $id = $this->new_node_id(*chart_ht);

   # Per-node attributes, keyed by chart table name.
   my %attribute = (NODE_START   => $start,
                    NODE_END     => $end,
                    NODE_TYPE    => $type,
                    NODE_COMMENT => $comment,
                    NODE_ROMAN   => $s);
   foreach my $table (keys %attribute) {
      $chart_ht{$table}->{$id} = $attribute{$table};
   }

   # Positional indexes used to find nodes by span boundary.
   $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}->{$id} = 1;
   $chart_ht{NODES_STARTING_AT}->{$start}->{$id} = 1;
   $chart_ht{NODES_ENDING_AT}->{$end}->{$id} = 1;

   return $id;
}
|
434 |
+
|
435 |
+
# Look up a node that covers exactly the span $start..$end.
#
# Arguments: $start, $end  span boundaries
#            *chart_ht     glob of the chart hash
# Returns:   the numerically smallest node id registered for that span in
#            $chart_ht{NODES_STARTING_AND_ENDING_AT}, or "" if there is none.
sub get_node_for_span {
   local($this, $start, $end, *chart_ht) = @_;

   my $span_nodes = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
   return "" unless defined($span_nodes);

   my ($lowest_id) = sort { $a <=> $b } keys %{$span_nodes};
   return $lowest_id if defined($lowest_id);
   return "";
}
|
443 |
+
|
444 |
+
# Look up a node of a given type that covers exactly the span $start..$end.
#
# Arguments: $start, $end  span boundaries
#            *chart_ht     glob of the chart hash
#            $type         value compared (with eq) against NODE_TYPE
# Returns:   the numerically smallest matching node id, or "" if the span
#            has no node of that type.
sub get_node_for_span_and_type {
   local($this, $start, $end, *chart_ht, $type) = @_;

   my $span_nodes = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
   return "" unless defined($span_nodes);

   # Candidates are examined in ascending id order; keep the first match.
   my ($first_match) = grep { $chart_ht{NODE_TYPE}->{$_} eq $type }
                       sort { $a <=> $b } keys %{$span_nodes};
   return defined($first_match) ? $first_match : "";
}
|
455 |
+
|
456 |
+
# Fetch the NODE_ROMAN value stored for a node.
#
# Arguments: $node_id   node to look up
#            *chart_id  chart glob supplied by the caller (see note)
#            $default   optional fallback; "" when omitted
# Returns:   $chart_ht{NODE_ROMAN}->{$node_id} if defined, else the fallback.
#
# NOTE: the glob argument is aliased to *chart_id, but the body reads
# %chart_ht, i.e. whatever %chart_ht is in scope at call time rather than
# the argument.
sub get_node_roman {
   local($this, $node_id, *chart_id, $default) = @_;

   my $stored_roman = $chart_ht{NODE_ROMAN}->{$node_id};
   return $stored_roman if defined($stored_roman);
   return defined($default) ? $default : "";
}
|
463 |
+
|
464 |
+
# Set one slot of a node: $chart_ht{NODE_SLOT}->{$node_id}->{$slot} = $value.
#
# Arguments: $node_id, $slot, $value  what to store and where
#            *chart_id                chart glob supplied by the caller (see note)
# Returns:   the stored value.
#
# NOTE: the glob argument is aliased to *chart_id, but the body writes to
# %chart_ht, i.e. whatever %chart_ht is in scope at call time rather than
# the argument.
sub set_node_id_slot_value {
   local($this, $node_id, $slot, $value, *chart_id) = @_;

   my $node_slots = ($chart_ht{NODE_SLOT}->{$node_id} ||= {});
   return $node_slots->{$slot} = $value;
}
|
469 |
+
|
470 |
+
# Copy slot values from one node to another.
#
# Arguments: $old_node_id  node to copy from
#            $new_node_id  node to copy to
#            *chart_id     chart glob supplied by the caller (see note)
#            @slots        slot names to copy; if the first element is "all",
#                          every slot of the old node is copied. With an
#                          empty list nothing is copied.
#
# Only defined values are copied. Slot selection for a non-"all" list is
# delegated to $util->member.
#
# NOTE: the glob argument is aliased to *chart_id, but the body works on
# %chart_ht, i.e. whatever %chart_ht is in scope at call time rather than
# the argument.
sub copy_slot_values {
   local($this, $old_node_id, $new_node_id, *chart_id, @slots) = @_;

   if (@slots) {
      my $copy_everything_p = ($slots[0] eq "all");
      foreach my $slot_name (keys %{$chart_ht{NODE_SLOT}->{$old_node_id}}) {
         next unless $copy_everything_p || $util->member($slot_name, @slots);
         my $slot_value = $chart_ht{NODE_SLOT}->{$old_node_id}->{$slot_name};
         next unless defined($slot_value);
         $chart_ht{NODE_SLOT}->{$new_node_id}->{$slot_name} = $slot_value;
      }
   }
}
|
482 |
+
|
483 |
+
# Read one slot of a node.
#
# Arguments: $node_id, $slot  which value to read
#            *chart_id        chart glob supplied by the caller (see note)
#            $default         optional fallback; "" when omitted
# Returns:   $chart_ht{NODE_SLOT}->{$node_id}->{$slot} if defined, else the
#            fallback.
#
# NOTE: the glob argument is aliased to *chart_id, but the body reads
# %chart_ht, i.e. whatever %chart_ht is in scope at call time rather than
# the argument.
sub get_node_id_slot_value {
   local($this, $node_id, $slot, *chart_id, $default) = @_;

   my $slot_value = $chart_ht{NODE_SLOT}->{$node_id}->{$slot};
   return $slot_value if defined($slot_value);
   return defined($default) ? $default : "";
}
|
490 |
+
|
491 |
+
# Return the value of a slot from the first node on a span that has it.
#
# Arguments: $start, $end  span boundaries
#            $slot         slot name to look for
#            *chart_id     chart glob supplied by the caller (see note)
#            $default      optional fallback; "" when omitted
# Returns:   the slot VALUE of the numerically smallest node id on the span
#            whose slot is defined; the fallback if the span is unknown or
#            no node carries the slot.
#
# NOTE: the glob argument is aliased to *chart_id, but the body reads
# %chart_ht, i.e. whatever %chart_ht is in scope at call time rather than
# the argument.
sub get_node_for_span_with_slot_value {
   local($this, $start, $end, $slot, *chart_id, $default) = @_;

   $default = "" unless defined($default);
   my $span_nodes = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
   return $default unless defined($span_nodes);

   foreach my $candidate_id (sort { $a <=> $b } keys %{$span_nodes}) {
      my $slot_value = $chart_ht{NODE_SLOT}->{$candidate_id}->{$slot};
      return $slot_value if defined($slot_value);
   }
   return $default;
}
|
503 |
+
|
504 |
+
# Return the id of the first node on a span that has a given slot.
#
# Arguments: $start, $end  span boundaries
#            $slot         slot name to look for
#            *chart_id     chart glob supplied by the caller (see note)
#            $default      optional fallback; "" when omitted
# Returns:   the numerically smallest node ID on the span whose slot is
#            defined; the fallback if the span is unknown or no node
#            carries the slot.
#
# NOTE: the glob argument is aliased to *chart_id, but the body reads
# %chart_ht, i.e. whatever %chart_ht is in scope at call time rather than
# the argument.
sub get_node_for_span_with_slot {
   local($this, $start, $end, $slot, *chart_id, $default) = @_;

   $default = "" unless defined($default);
   my $span_nodes = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
   return $default unless defined($span_nodes);

   foreach my $candidate_id (sort { $a <=> $b } keys %{$span_nodes}) {
      return $candidate_id if defined($chart_ht{NODE_SLOT}->{$candidate_id}->{$slot});
   }
   return $default;
}
|
516 |
+
|
517 |
+
# Record a complex-number span segment, merging it with a span that ends
# where this segment's middle lies.
#   e.g. 4 10 (= 40); 20 5 (= 25)
#   might become part of larger complex number span, e.g. 4 1000 3 100 20 1
#
# Arguments: $start, $mid, $end  positions of the segment
#            *chart_id           chart glob supplied by the caller (see note)
#            $line_number        not used by the active code
#
# If COMPLEX_NUMERIC_END_START already has a defined start for $mid, that
# entry is undefined and the earlier span is extended to $end; otherwise a
# new span $start..$end is recorded. Both COMPLEX_NUMERIC_START_END and
# COMPLEX_NUMERIC_END_START are updated.
#
# NOTE: the glob argument is aliased to *chart_id, but the body works on
# %chart_ht, i.e. whatever %chart_ht is in scope at call time rather than
# the argument.
sub register_new_complex_number_span_segment {
   local($this, $start, $mid, $end, *chart_id, $line_number) = @_;

   $old_start = $chart_ht{COMPLEX_NUMERIC_END_START}->{$mid};
   my $span_start = $start;
   if (defined($old_start)) {
      # An existing span ends at $mid: retire that end point and extend it.
      undef($chart_ht{COMPLEX_NUMERIC_END_START}->{$mid});
      $span_start = $old_start;
   }
   $chart_ht{COMPLEX_NUMERIC_START_END}->{$span_start} = $end;
   $chart_ht{COMPLEX_NUMERIC_END_START}->{$end} = $span_start;
}
|
532 |
+
|
533 |
+
# Romanize a string token by token, reusing earlier results.
#
# Arguments: same list that is handed on to romanize: $s (text),
#            $lang_code, $output_style, *ht, *pinyin_ht,
#            $initial_char_offset, $control, $line_number.
# Returns:   the romanized string; or, when $control asks for
#            "return chart" or "return offset mappings" (case-insensitive),
#            whatever romanize returns for the whole string.
#
# The text is split into whitespace separators and non-whitespace tokens.
# Separators and all-ASCII tokens are copied through unchanged; any other
# token is romanized once via romanize and memoized in
# $ht{CACHED_ROMANIZATION}->{$lang_code}->{$token}.
# NOTE(review): the cache key does not include $output_style -- confirm that
# callers never mix output styles on the same %ht.
sub romanize_by_token_with_caching {
   local($this, $s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number) = @_;

   $control = "" unless defined($control);
   # Chart / offset-mapping requests are passed straight to romanize.
   if (($control =~ /return chart/i) || ($control =~ /return offset mappings/i)) {
      return $this->romanize($s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number);
   }

   # Phase 1: peel (separator, token) pairs off the front of the text.
   my @tokens = ();
   my @separators = ();
   $s =~ s/\n$//; # Added May 2, 2019 as bug-fix (duplicate empty lines)
   while (($sep, $token, $rest) = ($s =~ /^(\s*)(\S+)(.*)$/)) {
      push(@tokens, $token);
      push(@separators, $sep);
      $s = $rest;
   }
   # Whatever could not be tokenized is kept as the final separator.
   push(@separators, $s);

   # Phase 2: rebuild the line, romanizing non-ASCII tokens.
   my $result = "";
   foreach my $idx (0 .. $#tokens) {
      my $current_token = $tokens[$idx];
      $result .= $separators[$idx];
      if ($current_token =~ /^[\x00-\x7F]*$/) { # all ASCII
         $result .= $current_token;
         next;
      }
      my $rom_token = $ht{CACHED_ROMANIZATION}->{$lang_code}->{$current_token};
      if (! defined($rom_token)) {
         $rom_token = $this->romanize($current_token, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number);
         $ht{CACHED_ROMANIZATION}->{$lang_code}->{$current_token} = $rom_token if defined($rom_token);
      }
      $result .= $rom_token;
   }
   my $trailing_sep = $separators[scalar(@tokens)];
   $result .= $trailing_sep if defined($trailing_sep);

   return $result;
}
|
571 |
+
|
572 |
+
sub romanize {
|
573 |
+
local($this, $s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number, $initial_rom_char_offset) = @_;
|
574 |
+
|
575 |
+
my $orig_lang_code = $lang_code;
|
576 |
+
# Check whether the text (to be romanized) starts with a language code directive.
|
577 |
+
if (($line_lang_code) = ($s =~ /^::lcode\s+([a-z][a-z][a-z])\s/)) {
|
578 |
+
$lang_code = $line_lang_code;
|
579 |
+
}
|
580 |
+
$initial_char_offset = 0 unless defined($initial_char_offset);
|
581 |
+
$initial_rom_char_offset = 0 unless defined($initial_rom_char_offset);
|
582 |
+
$control = "" unless defined($control);
|
583 |
+
my $return_chart_p = ($control =~ /return chart/i);
|
584 |
+
my $return_offset_mappings_p = ($control =~ /return offset mappings/i);
|
585 |
+
$line_number = "" unless defined($line_number);
|
586 |
+
my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
|
587 |
+
my $n_characters = $#chars + 1;
|
588 |
+
%chart_ht = ();
|
589 |
+
$chart_ht{N_CHARS} = $n_characters;
|
590 |
+
$chart_ht{N_NODES} = 0;
|
591 |
+
my $char = "";
|
592 |
+
my $char_name = "";
|
593 |
+
my $prev_script = "";
|
594 |
+
my $current_script = "";
|
595 |
+
my $script_start = 0;
|
596 |
+
my $script_end = 0;
|
597 |
+
my $prev_letter_plus_script = "";
|
598 |
+
my $current_letter_plus_script = "";
|
599 |
+
my $letter_plus_script_start = 0;
|
600 |
+
my $letter_plus_script_end = 0;
|
601 |
+
my $log ="";
|
602 |
+
my $n_right_to_left_chars = 0;
|
603 |
+
my $n_left_to_right_chars = 0;
|
604 |
+
my $hebrew_word_start = ""; # used to identify Hebrew words with points
|
605 |
+
my $hebrew_word_contains_point = 0;
|
606 |
+
my $current_word_start = "";
|
607 |
+
my $current_word_script = "";
|
608 |
+
my $braille_all_caps_p = 0;
|
609 |
+
|
610 |
+
# prep
|
611 |
+
foreach $i ((0 .. ($#chars + 1))) {
|
612 |
+
if ($i <= $#chars) {
|
613 |
+
$char = $chars[$i];
|
614 |
+
$chart_ht{ORIG_CHAR}->{$i} = $char;
|
615 |
+
$char_name = $ht{UTF_TO_CHAR_NAME}->{$char} || "";
|
616 |
+
$chart_ht{CHAR_NAME}->{$i} = $char_name;
|
617 |
+
$current_script = $this->char_name_to_script($char_name, *ht);
|
618 |
+
$current_script_direction = $ht{DIRECTION}->{$current_script} || '';
|
619 |
+
if ($current_script_direction eq 'right-to-left') {
|
620 |
+
$n_right_to_left_chars++;
|
621 |
+
} elsif (($char =~ /^[a-z]$/i) || ! ($char =~ /^[\x00-\x7F]$/)) {
|
622 |
+
$n_left_to_right_chars++;
|
623 |
+
}
|
624 |
+
$chart_ht{CHAR_SCRIPT}->{$i} = $current_script;
|
625 |
+
$chart_ht{SCRIPT_SEGMENT_START}->{$i} = ""; # default value, to be updated later
|
626 |
+
$chart_ht{SCRIPT_SEGMENT_END}->{$i} = ""; # default value, to be updated later
|
627 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_START}->{$i} = ""; # default value, to be updated later
|
628 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_END}->{$i} = ""; # default value, to be updated later
|
629 |
+
$subjoined_char_p = $this->subjoined_char_p($char_name);
|
630 |
+
$chart_ht{CHAR_SUBJOINED}->{$i} = $subjoined_char_p;
|
631 |
+
$letter_plus_char_p = $this->letter_plus_char_p($char_name);
|
632 |
+
$chart_ht{CHAR_LETTER_PLUS}->{$i} = $letter_plus_char_p;
|
633 |
+
$current_letter_plus_script = ($letter_plus_char_p) ? $current_script : "";
|
634 |
+
$numeric_value = $ht{UTF_TO_NUMERIC}->{$char};
|
635 |
+
$numeric_value = "" unless defined($numeric_value);
|
636 |
+
$annotation = $ht{UTF_ANNOTATION}->{$char};
|
637 |
+
$annotation = "" unless defined($annotation);
|
638 |
+
$chart_ht{CHAR_NUMERIC_VALUE}->{$i} = $numeric_value;
|
639 |
+
$chart_ht{CHAR_ANNOTATION}->{$i} = $annotation;
|
640 |
+
$syllable_info = $ht{UTF_TO_SYLLABLE_INFO}->{$char} || "";
|
641 |
+
$chart_ht{CHAR_SYLLABLE_INFO}->{$i} = $syllable_info;
|
642 |
+
$tone_mark = $ht{UTF_TO_TONE_MARK}->{$char} || "";
|
643 |
+
$chart_ht{CHAR_TONE_MARK}->{$i} = $tone_mark;
|
644 |
+
} else {
|
645 |
+
$char = "";
|
646 |
+
$char_name = "";
|
647 |
+
$current_script = "";
|
648 |
+
$current_letter_plus_script = "";
|
649 |
+
}
|
650 |
+
if ($char_name =~ /^HEBREW (LETTER|POINT|PUNCTUATION GERESH) /) {
|
651 |
+
$hebrew_word_start = $i if $hebrew_word_start eq "";
|
652 |
+
$hebrew_word_contains_point = 1 if $char_name =~ /^HEBREW POINT /;
|
653 |
+
} elsif ($hebrew_word_start ne "") {
|
654 |
+
if ($hebrew_word_contains_point) {
|
655 |
+
foreach $j (($hebrew_word_start .. ($i-1))) {
|
656 |
+
$chart_ht{CHAR_PART_OF_POINTED_HEBREW_WORD}->{$j} = 1;
|
657 |
+
}
|
658 |
+
$chart_ht{CHAR_START_OF_WORD}->{$hebrew_word_start} = 1;
|
659 |
+
$chart_ht{CHAR_END_OF_WORD}->{($i-1)} = 1;
|
660 |
+
}
|
661 |
+
$hebrew_word_start = "";
|
662 |
+
$hebrew_word_contains_point = 0;
|
663 |
+
}
|
664 |
+
my $part_of_word_p = $current_script
|
665 |
+
&& ($this->letter_plus_char_p($char_name)
|
666 |
+
|| $this->subjoined_char_p($char_name)
|
667 |
+
|| ($char_name =~ /\b(LETTER|SYLLABLE|SYLLABICS|LIGATURE)\b/));
|
668 |
+
|
669 |
+
# Braille punctuation
|
670 |
+
my $end_offset = 0;
|
671 |
+
if ($char_name =~ /^Braille\b/i) {
|
672 |
+
if (($char =~ /^\s*$/) || ($char_name =~ /BLANK/)) {
|
673 |
+
$part_of_word_p = 0;
|
674 |
+
$braille_all_caps_p = 0;
|
675 |
+
} elsif ($chart_ht{NOT_PART_OF_WORD_P}->{$i}) {
|
676 |
+
$part_of_word_p = 0;
|
677 |
+
$braille_all_caps_p = 0;
|
678 |
+
} elsif ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$char}})
|
679 |
+
|| (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$char}})) {
|
680 |
+
$part_of_word_p = 0;
|
681 |
+
$braille_all_caps_p = 0;
|
682 |
+
} elsif (($i+1 <= $#chars)
|
683 |
+
&& ($s1 = $char . $chars[$i+1])
|
684 |
+
&& ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$s1}})
|
685 |
+
|| (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$s1}}))) {
|
686 |
+
$part_of_word_p = 0;
|
687 |
+
$braille_all_caps_p = 0;
|
688 |
+
$chart_ht{NOT_PART_OF_WORD_P}->{($i+1)} = 1;
|
689 |
+
} elsif (($i+2 <= $#chars)
|
690 |
+
&& ($s2 = $char . $chars[$i+1] . $chars[$i+2])
|
691 |
+
&& ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$s2}})
|
692 |
+
|| (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$s2}}))) {
|
693 |
+
$part_of_word_p = 0;
|
694 |
+
$braille_all_caps_p = 0;
|
695 |
+
$chart_ht{NOT_PART_OF_WORD_P}->{($i+1)} = 1;
|
696 |
+
$chart_ht{NOT_PART_OF_WORD_P}->{($i+2)} = 1;
|
697 |
+
} elsif (($i+1 <= $#chars)
|
698 |
+
&& ($char eq $braille_capital_letter_indicator)
|
699 |
+
&& ($chars[$i+1] eq $braille_capital_letter_indicator)) {
|
700 |
+
$braille_all_caps_p = 1;
|
701 |
+
} else {
|
702 |
+
$part_of_word_p = 1;
|
703 |
+
}
|
704 |
+
# last period in Braille text is also not part_of_word_p
|
705 |
+
if (($char eq $braille_period)
|
706 |
+
&& (($i == $#chars)
|
707 |
+
|| (($i < $#chars)
|
708 |
+
&& (! $this->braille_string_p($chars[$i+1]))))) {
|
709 |
+
$part_of_word_p = 0;
|
710 |
+
}
|
711 |
+
# period before other word-external punctuation is also not part_of_word_p
|
712 |
+
if (($i > 0)
|
713 |
+
&& ($chars[$i-1] eq $braille_period)
|
714 |
+
&& (! $part_of_word_p)
|
715 |
+
&& ($current_word_start ne "")) {
|
716 |
+
$end_offset = -1;
|
717 |
+
}
|
718 |
+
} else {
|
719 |
+
$braille_all_caps_p = 0;
|
720 |
+
}
|
721 |
+
$chart_ht{BRAILLE_ALL_CAPS_P}->{$i} = $braille_all_caps_p;
|
722 |
+
|
723 |
+
if (($current_word_start ne "")
|
724 |
+
&& ((! $part_of_word_p)
|
725 |
+
|| ($current_script ne $current_word_script))) {
|
726 |
+
# END OF WORD
|
727 |
+
$chart_ht{CHAR_START_OF_WORD}->{$current_word_start} = 1;
|
728 |
+
$chart_ht{CHAR_END_OF_WORD}->{($i-1+$end_offset)} = 1;
|
729 |
+
my $word = join("", @chars[$current_word_start .. ($i-1+$end_offset)]);
|
730 |
+
$chart_ht{WORD_START_END}->{$current_word_start}->{$i} = $word;
|
731 |
+
$chart_ht{WORD_END_START}->{$i+$end_offset}->{$current_word_start} = $word;
|
732 |
+
# print STDERR "Word ($current_word_start-$i+$end_offset): $word ($current_word_script)\n";
|
733 |
+
$current_word_start = "";
|
734 |
+
$current_word_script = "";
|
735 |
+
}
|
736 |
+
if ($part_of_word_p && ($current_word_start eq "")) {
|
737 |
+
# START OF WORD
|
738 |
+
$current_word_start = $i;
|
739 |
+
$current_word_script = $current_script;
|
740 |
+
}
|
741 |
+
# print STDERR "$i char: $char ($current_script)\n";
|
742 |
+
unless ($current_script eq $prev_script) {
|
743 |
+
if ($prev_script && ($i-1 >= $script_start)) {
|
744 |
+
my $script_end = $i;
|
745 |
+
$chart_ht{SCRIPT_SEGMENT_START_TO_END}->{$script_start} = $script_end;
|
746 |
+
$chart_ht{SCRIPT_SEGMENT_END_TO_START}->{$script_end} = $script_start;
|
747 |
+
foreach $i (($script_start .. $script_end)) {
|
748 |
+
$chart_ht{SCRIPT_SEGMENT_START}->{$i} = $script_start;
|
749 |
+
$chart_ht{SCRIPT_SEGMENT_END}->{$i} = $script_end;
|
750 |
+
}
|
751 |
+
# print STDERR "Script segment $script_start-$script_end: $prev_script\n";
|
752 |
+
}
|
753 |
+
$script_start = $i;
|
754 |
+
}
|
755 |
+
unless ($current_letter_plus_script eq $prev_letter_plus_script) {
|
756 |
+
if ($prev_letter_plus_script && ($i-1 >= $letter_plus_script_start)) {
|
757 |
+
my $letter_plus_script_end = $i;
|
758 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_START_TO_END}->{$letter_plus_script_start} = $letter_plus_script_end;
|
759 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_END_TO_START}->{$letter_plus_script_end} = $letter_plus_script_start;
|
760 |
+
foreach $i (($letter_plus_script_start .. $letter_plus_script_end)) {
|
761 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_START}->{$i} = $letter_plus_script_start;
|
762 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_END}->{$i} = $letter_plus_script_end;
|
763 |
+
}
|
764 |
+
# print STDERR "Script token segment $letter_plus_script_start-$letter_plus_script_end: $prev_letter_plus_script\n";
|
765 |
+
}
|
766 |
+
$letter_plus_script_start = $i;
|
767 |
+
}
|
768 |
+
$prev_script = $current_script;
|
769 |
+
$prev_letter_plus_script = $current_letter_plus_script;
|
770 |
+
}
|
771 |
+
$ht{STRING_IS_DOMINANTLY_RIGHT_TO_LEFT}->{$s} = 1 if $n_right_to_left_chars > $n_left_to_right_chars;
|
772 |
+
|
773 |
+
# main
|
774 |
+
my $i = 0;
|
775 |
+
while ($i <= $#chars) {
|
776 |
+
my $char = $chart_ht{ORIG_CHAR}->{$i};
|
777 |
+
my $current_script = $chart_ht{CHAR_SCRIPT}->{$i};
|
778 |
+
$chart_ht{CHART_CONTAINS_SCRIPT}->{$current_script} = 1;
|
779 |
+
my $script_segment_start = $chart_ht{SCRIPT_SEGMENT_START}->{$i};
|
780 |
+
my $script_segment_end = $chart_ht{SCRIPT_SEGMENT_END}->{$i};
|
781 |
+
my $char_name = $chart_ht{CHAR_NAME}->{$i};
|
782 |
+
my $subjoined_char_p = $chart_ht{CHAR_SUBJOINED}->{$i};
|
783 |
+
my $letter_plus_char_p = $chart_ht{CHAR_LETTER_PLUS}->{$i};
|
784 |
+
my $numeric_value = $chart_ht{CHAR_NUMERIC_VALUE}->{$i};
|
785 |
+
my $annotation = $chart_ht{CHAR_ANNOTATION}->{$i};
|
786 |
+
# print STDERR " $char_name annotation: $annotation\n" if $annotation;
|
787 |
+
my $tone_mark = $chart_ht{CHAR_TONE_MARK}->{$i};
|
788 |
+
my $found_char_mapping_p = 0;
|
789 |
+
my $prev_char_name = ($i >= 1) ? $chart_ht{CHAR_NAME}->{($i-1)} : "";
|
790 |
+
my $prev2_script = ($i >= 2) ? $chart_ht{CHAR_SCRIPT}->{($i-2)} : "";
|
791 |
+
my $prev_script = ($i >= 1) ? $chart_ht{CHAR_SCRIPT}->{($i-1)} : "";
|
792 |
+
my $next_script = ($i < $#chars) ? $chart_ht{CHAR_SCRIPT}->{($i+1)} : "";
|
793 |
+
my $next_char = ($i < $#chars) ? $chart_ht{ORIG_CHAR}->{($i+1)} : "";
|
794 |
+
my $next_char_name = $ht{UTF_TO_CHAR_NAME}->{$next_char} || "";
|
795 |
+
my $prev2_letter_plus_char_p = ($i >= 2) ? $chart_ht{CHAR_LETTER_PLUS}->{($i-2)} : 0;
|
796 |
+
my $prev_letter_plus_char_p = ($i >= 1) ? $chart_ht{CHAR_LETTER_PLUS}->{($i-1)} : 0;
|
797 |
+
my $next_letter_plus_char_p = ($i < $#chars) ? $chart_ht{CHAR_LETTER_PLUS}->{($i+1)} : 0;
|
798 |
+
my $next_index = $i + 1;
|
799 |
+
|
800 |
+
# Braille numeric mode
|
801 |
+
if ($char eq $braille_number_indicator) {
|
802 |
+
my $offset = 0;
|
803 |
+
my $numeric_value = "";
|
804 |
+
my $digit;
|
805 |
+
while ($i+$offset < $#chars) {
|
806 |
+
$offset++;
|
807 |
+
my $offset_char = $chart_ht{ORIG_CHAR}->{$i+$offset};
|
808 |
+
if (defined($digit = $ht{BRAILLE_TO_DIGIT}->{$offset_char})) {
|
809 |
+
$numeric_value .= $digit;
|
810 |
+
} elsif (($offset_char eq $braille_decimal_point)
|
811 |
+
|| ($ht{UTF_CHAR_MAPPING}->{$offset_char}->{"."})) {
|
812 |
+
$numeric_value .= ".";
|
813 |
+
} elsif ($offset_char eq $braille_comma) {
|
814 |
+
$numeric_value .= ",";
|
815 |
+
} elsif ($offset_char eq $braille_numeric_space) {
|
816 |
+
$numeric_value .= " ";
|
817 |
+
} elsif ($offset_char eq $braille_solidus) {
|
818 |
+
$numeric_value .= "/";
|
819 |
+
} elsif ($offset_char eq $braille_number_indicator) {
|
820 |
+
# stay in Braille numeric mode
|
821 |
+
} elsif ($offset_char eq $braille_letter_indicator) {
|
822 |
+
# consider as part of number, but without contributing to numeric_value
|
823 |
+
last;
|
824 |
+
} else {
|
825 |
+
$offset--;
|
826 |
+
last;
|
827 |
+
}
|
828 |
+
}
|
829 |
+
if ($offset) {
|
830 |
+
$next_index = $i + $offset + 1;
|
831 |
+
$node_id = $this->add_node($numeric_value, $i, $next_index, *chart_ht, "", "braille number");
|
832 |
+
$found_char_mapping_p = 1;
|
833 |
+
}
|
834 |
+
}
|
835 |
+
|
836 |
+
unless ($found_char_mapping_p) {
|
837 |
+
foreach $string_length (reverse(1 .. 6)) {
|
838 |
+
next if ($i + $string_length-1) > $#chars;
|
839 |
+
my $start_of_word_p = $chart_ht{CHAR_START_OF_WORD}->{$i} || 0;
|
840 |
+
my $end_of_word_p = $chart_ht{CHAR_END_OF_WORD}->{($i+$string_length-1)} || 0;
|
841 |
+
my $multi_char_substring = join("", @chars[$i..($i+$string_length-1)]);
|
842 |
+
my @mappings = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}};
|
843 |
+
@mappings = keys %{$ht{UTF_CHAR_MAPPING}->{$multi_char_substring}} unless @mappings;
|
844 |
+
my @mappings_whole = ();
|
845 |
+
my @mappings_start_or_end = ();
|
846 |
+
my @mappings_other = ();
|
847 |
+
foreach $mapping (@mappings) {
|
848 |
+
next if $mapping =~ /\(__.*__\)/;
|
849 |
+
if ($ht{USE_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
|
850 |
+
|| $ht{USE_ONLY_FOR_WHOLE_WORD}->{$multi_char_substring}->{$mapping}) {
|
851 |
+
push(@mappings_whole, $mapping) if $start_of_word_p && $end_of_word_p;
|
852 |
+
} elsif ($ht{USE_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
|
853 |
+
|| $ht{USE_ONLY_AT_START_OF_WORD}->{$multi_char_substring}->{$mapping}) {
|
854 |
+
push(@mappings_start_or_end, $mapping) if $start_of_word_p;
|
855 |
+
} elsif ($ht{USE_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
|
856 |
+
|| $ht{USE_ONLY_AT_END_OF_WORD}->{$multi_char_substring}->{$mapping}) {
|
857 |
+
push(@mappings_start_or_end, $mapping) if $end_of_word_p;
|
858 |
+
} else {
|
859 |
+
push(@mappings_other, $mapping);
|
860 |
+
}
|
861 |
+
}
|
862 |
+
@mappings = @mappings_whole;
|
863 |
+
@mappings = @mappings_start_or_end unless @mappings;
|
864 |
+
@mappings = @mappings_other unless @mappings;
|
865 |
+
foreach $mapping (@mappings) {
|
866 |
+
next if $mapping =~ /\(__.*__\)/;
|
867 |
+
if ($ht{DONT_USE_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
|
868 |
+
|| $ht{DONT_USE_AT_START_OF_WORD}->{$multi_char_substring}->{$mapping}) {
|
869 |
+
next if $start_of_word_p;
|
870 |
+
}
|
871 |
+
if ($ht{DONT_USE_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
|
872 |
+
|| $ht{DONT_USE_AT_END_OF_WORD}->{$multi_char_substring}->{$mapping}) {
|
873 |
+
next if $end_of_word_p;
|
874 |
+
}
|
875 |
+
my $mapping2 = ($chart_ht{BRAILLE_ALL_CAPS_P}->{$i}) ? (uc $mapping) : $mapping;
|
876 |
+
$node_id = $this->add_node($mapping2, $i, $i+$string_length, *chart_ht, "", "multi-char-mapping");
|
877 |
+
$next_index = $i + $string_length;
|
878 |
+
$found_char_mapping_p = 1;
|
879 |
+
if ($annotation) {
|
880 |
+
@annotation_elems = split(/,\s*/, $annotation);
|
881 |
+
foreach $annotation_elem (@annotation_elems) {
|
882 |
+
if (($a_slot, $a_value) = ($annotation_elem =~ /^(\S+?):(\S+)\s*$/)) {
|
883 |
+
$this->set_node_id_slot_value($node_id, $a_slot, $a_value, *chart_ht);
|
884 |
+
} else {
|
885 |
+
$this->set_node_id_slot_value($node_id, $annotation_elem, 1, *chart_ht);
|
886 |
+
}
|
887 |
+
}
|
888 |
+
}
|
889 |
+
}
|
890 |
+
my @alt_mappings = keys %{$ht{UTF_CHAR_ALT_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}};
|
891 |
+
@alt_mappings = keys %{$ht{UTF_CHAR_ALT_MAPPING}->{$multi_char_substring}} unless @alt_mappings;
|
892 |
+
@alt_mappings = () if ($#alt_mappings == 0) && ($alt_mappings[0] eq "_NONE_");
|
893 |
+
foreach $alt_mapping (@alt_mappings) {
|
894 |
+
if ($chart_ht{CHAR_PART_OF_POINTED_HEBREW_WORD}->{$i}) {
|
895 |
+
next unless
|
896 |
+
$ht{USE_ALT_IN_POINTED_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
|
897 |
+
|| $ht{USE_ALT_IN_POINTED}->{$multi_char_substring}->{$alt_mapping};
|
898 |
+
}
|
899 |
+
if ($ht{USE_ALT_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
|
900 |
+
|| $ht{USE_ALT_ONLY_FOR_WHOLE_WORD}->{$multi_char_substring}->{$alt_mapping}) {
|
901 |
+
next unless $start_of_word_p && $end_of_word_p;
|
902 |
+
}
|
903 |
+
if ($ht{USE_ALT_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
|
904 |
+
|| $ht{USE_ALT_ONLY_AT_START_OF_WORD}->{$multi_char_substring}->{$alt_mapping}) {
|
905 |
+
next unless $start_of_word_p;
|
906 |
+
}
|
907 |
+
if ($ht{USE_ALT_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
|
908 |
+
|| $ht{USE_ALT_ONLY_AT_END_OF_WORD}->{$multi_char_substring}->{$alt_mapping}) {
|
909 |
+
next unless $end_of_word_p;
|
910 |
+
}
|
911 |
+
my $alt_mapping2 = ($chart_ht{BRAILLE_ALL_CAPS_P}->{$i}) ? (uc $alt_mapping) : $alt_mapping;
|
912 |
+
$node_id = $this->add_node($alt_mapping2, $i, $i+$string_length, *chart_ht, "alt", "multi-char-mapping");
|
913 |
+
if ($annotation) {
|
914 |
+
@annotation_elems = split(/,\s*/, $annotation);
|
915 |
+
foreach $annotation_elem (@annotation_elems) {
|
916 |
+
if (($a_slot, $a_value) = ($annotation_elem =~ /^(\S+?):(\S+)\s*$/)) {
|
917 |
+
$this->set_node_id_slot_value($node_id, $a_slot, $a_value, *chart_ht);
|
918 |
+
} else {
|
919 |
+
$this->set_node_id_slot_value($node_id, $annotation_elem, 1, *chart_ht);
|
920 |
+
}
|
921 |
+
}
|
922 |
+
}
|
923 |
+
}
|
924 |
+
}
|
925 |
+
}
|
926 |
+
unless ($found_char_mapping_p) {
|
927 |
+
my $prev_node_id = $this->get_node_for_span($i-4, $i, *chart_ht)
|
928 |
+
|| $this->get_node_for_span($i-3, $i, *chart_ht)
|
929 |
+
|| $this->get_node_for_span($i-2, $i, *chart_ht)
|
930 |
+
|| $this->get_node_for_span($i-1, $i, *chart_ht);
|
931 |
+
my $prev_char_roman = ($prev_node_id) ? $this->get_node_roman($prev_node_id, *chart_id) : "";
|
932 |
+
my $prev_node_start = ($prev_node_id) ? $chart_ht{NODE_START}->{$prev_node_id} : "";
|
933 |
+
|
934 |
+
# Number
|
935 |
+
if (($numeric_value =~ /\d/)
|
936 |
+
&& (! ($char_name =~ /SUPERSCRIPT/))) {
|
937 |
+
my $prev_numeric_value = $this->get_node_for_span_with_slot_value($i-1, $i, "numeric-value", *chart_id);
|
938 |
+
my $sep = "";
|
939 |
+
$sep = " " if ($char_name =~ /^vulgar fraction /i) && ($prev_numeric_value =~ /\d/);
|
940 |
+
$node_id = $this->add_node("$sep$numeric_value", $i, $i+1, *chart_ht, "", "number");
|
941 |
+
$this->set_node_id_slot_value($node_id, "numeric-value", $numeric_value, *chart_ht);
|
942 |
+
if ((($prev_numeric_value =~ /\d/) && ($numeric_value =~ /\d\d/))
|
943 |
+
|| (($prev_numeric_value =~ /\d\d/) && ($numeric_value =~ /\d/))) {
|
944 |
+
# pull in any other parts of single digits
|
945 |
+
my $j = 1;
|
946 |
+
# pull in any single digits adjoining on left
|
947 |
+
if ($prev_numeric_value =~ /^\d$/) {
|
948 |
+
while (1) {
|
949 |
+
if (($i-$j-1 >= 0)
|
950 |
+
&& defined($digit_value = $this->get_node_for_span_with_slot_value($i-$j-1, $i-$j, "numeric-value", *chart_id))
|
951 |
+
&& ($digit_value =~ /^\d$/)) {
|
952 |
+
$j++;
|
953 |
+
} elsif (($i-$j-2 >= 0)
|
954 |
+
&& ($chart_ht{ORIG_CHAR}->{($i-$j-1)} =~ /^[.,]$/)
|
955 |
+
&& defined($digit_value = $this->get_node_for_span_with_slot_value($i-$j-2, $i-$j-1, "numeric-value", *chart_id))
|
956 |
+
&& ($digit_value =~ /^\d$/)) {
|
957 |
+
$j += 2;
|
958 |
+
} else {
|
959 |
+
last;
|
960 |
+
}
|
961 |
+
}
|
962 |
+
}
|
963 |
+
# pull in any single digits adjoining on right
|
964 |
+
my $k = 0;
|
965 |
+
if ($numeric_value =~ /^\d$/) {
|
966 |
+
while (1) {
|
967 |
+
if (defined($next_numeric_value = $chart_ht{CHAR_NUMERIC_VALUE}->{($i+$k+1)})
|
968 |
+
&& ($next_numeric_value =~ /^\d$/)) {
|
969 |
+
$k++;
|
970 |
+
} else {
|
971 |
+
last;
|
972 |
+
}
|
973 |
+
}
|
974 |
+
}
|
975 |
+
$this->register_new_complex_number_span_segment($i-$j, $i, $i+$k+1, *chart_ht, $line_number);
|
976 |
+
}
|
977 |
+
if ($chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)
|
978 |
+
&& ($tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""))) {
|
979 |
+
$de_accented_translit = $util->de_accent_string($tonal_translit);
|
980 |
+
if ($numeric_value =~ /^(10000|1000000000000|10000000000000000)$/) {
|
981 |
+
$chart_ht{NODE_TYPE}->{$node_id} = "alt"; # keep, but demote
|
982 |
+
$alt_node_id = $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "", "CJK");
|
983 |
+
} else {
|
984 |
+
$alt_node_id = $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "alt", "CJK");
|
985 |
+
}
|
986 |
+
}
|
987 |
+
|
988 |
+
# ASCII
|
989 |
+
} elsif ($char =~ /^[\x00-\x7F]$/) {
|
990 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "ASCII"); # ASCII character, incl. control characters
|
991 |
+
|
992 |
+
# Emoji, dingbats, pictographs
|
993 |
+
} elsif ($char =~ /^(\xE2[\x98-\x9E]|\xF0\x9F[\x8C-\xA7])/) {
|
994 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "pictograph");
|
995 |
+
|
996 |
+
# Hangul (Korean)
|
997 |
+
} elsif (($char =~ /^[\xEA-\xED]/)
|
998 |
+
&& ($romanized_char = $this->unicode_hangul_romanization($char))) {
|
999 |
+
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "Hangul");
|
1000 |
+
|
1001 |
+
# CJK (Chinese, Japanese, Korean)
|
1002 |
+
} elsif ($chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)
|
1003 |
+
&& ($tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""))) {
|
1004 |
+
$de_accented_translit = $util->de_accent_string($tonal_translit);
|
1005 |
+
$this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "", "CJK");
|
1006 |
+
|
1007 |
+
# Virama (cancel preceding vowel in Abudiga scripts)
|
1008 |
+
} elsif ($char_name =~ /\bSIGN (?:VIRAMA|AL-LAKUNA|ASAT|COENG|PAMAAEH)\b/) {
|
1009 |
+
# VIRAMA: cancel preceding default vowel (in Abudiga scripts)
|
1010 |
+
if (($prev_script eq $current_script)
|
1011 |
+
&& (($prev_char_roman_consonant, $prev_char_roman_vowel) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])([aeiou]+)$/i))
|
1012 |
+
&& ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{(lc $prev_char_roman_vowel)})) {
|
1013 |
+
$this->add_node($prev_char_roman_consonant, $prev_node_start, $i+1, *chart_ht, "", "virama");
|
1014 |
+
} else {
|
1015 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-virama");
|
1016 |
+
}
|
1017 |
+
|
1018 |
+
# Nukta (special (typically foreign) variant)
|
1019 |
+
} elsif ($char_name =~ /\bSIGN (?:NUKTA)\b/) {
|
1020 |
+
# NUKTA (dot): indicates special (typically foreign) variant; normally covered by multi-mappings
|
1021 |
+
if ($prev_script eq $current_script) {
|
1022 |
+
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "nukta");
|
1023 |
+
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
|
1024 |
+
$this->set_node_id_slot_value($node_id, "nukta", 1, *chart_ht);
|
1025 |
+
} else {
|
1026 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-nukta");
|
1027 |
+
}
|
1028 |
+
|
1029 |
+
# Zero-width character, incl. zero width space/non-joiner/joiner, left-to-right/right-to-left mark
|
1030 |
+
} elsif ($char =~ /^\xE2\x80[\x8B-\x8F\xAA-\xAE]$/) {
|
1031 |
+
if ($prev_node_id) {
|
1032 |
+
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "zero-width-char");
|
1033 |
+
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
|
1034 |
+
} else {
|
1035 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "zero-width-char");
|
1036 |
+
}
|
1037 |
+
} elsif (($char =~ /^\xEF\xBB\xBF$/) && $prev_node_id) { # OK to leave byte-order-mark at beginning of line
|
1038 |
+
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "zero-width-char");
|
1039 |
+
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
|
1040 |
+
|
1041 |
+
# Tone mark
|
1042 |
+
} elsif ($tone_mark) {
|
1043 |
+
if ($prev_script eq $current_script) {
|
1044 |
+
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "tone-mark");
|
1045 |
+
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
|
1046 |
+
$this->set_node_id_slot_value($node_id, "tone-mark", $tone_mark, *chart_ht);
|
1047 |
+
} else {
|
1048 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-tone-mark");
|
1049 |
+
}
|
1050 |
+
|
1051 |
+
# Diacritic
|
1052 |
+
} elsif (($char_name =~ /\b(ACCENT|TONE|COMBINING DIAERESIS|COMBINING DIAERESIS BELOW|COMBINING MACRON|COMBINING VERTICAL LINE ABOVE|COMBINING DOT ABOVE RIGHT|COMBINING TILDE|COMBINING CYRILLIC|MUUSIKATOAN|TRIISAP)\b/) && ($ht{UTF_TO_CAT}->{$char} =~ /^Mn/)) {
|
1053 |
+
if ($prev_script eq $current_script) {
|
1054 |
+
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "diacritic");
|
1055 |
+
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
|
1056 |
+
$diacritic = lc $char_name;
|
1057 |
+
$diacritic =~ s/^.*(?:COMBINING CYRILLIC|COMBINING|SIGN)\s+//i;
|
1058 |
+
$diacritic =~ s/^.*(ACCENT|TONE)/$1/i;
|
1059 |
+
$diacritic =~ s/^\s*//;
|
1060 |
+
$this->set_node_id_slot_value($node_id, "diacritic", $diacritic, *chart_ht);
|
1061 |
+
# print STDERR "diacritic: $diacritic\n";
|
1062 |
+
} else {
|
1063 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-diacritic");
|
1064 |
+
}
|
1065 |
+
|
1066 |
+
# Romanize to find out more
|
1067 |
+
} elsif ($char_name) {
|
1068 |
+
if (defined($romanized_char = $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht))) {
|
1069 |
+
# print STDERR "ROM l.$line_number/$i: $romanized_char\n" if $line_number =~ /^[12]$/;
|
1070 |
+
print STDOUT "ROM l.$line_number/$i: $romanized_char\n" if $verbosePM;
|
1071 |
+
|
1072 |
+
# Empty string mapping
|
1073 |
+
if ($romanized_char eq "\"\"") {
|
1074 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "empty-string-mapping");
|
1075 |
+
# consider adding something for implausible romanizations of length 6+
|
1076 |
+
|
1077 |
+
# keep original character (instead of romanized_char lengthener, character-18b00 etc.)
|
1078 |
+
} elsif (($romanized_char =~ /^(character|lengthener|modifier)/)) {
|
1079 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "nevermind-keep-original");
|
1080 |
+
|
1081 |
+
# Syllabic suffix in Abudiga languages, e.g. -m, -ng
|
1082 |
+
} elsif (($romanized_char =~ /^\+(H|M|N|NG)$/i)
|
1083 |
+
&& ($prev_script eq $current_script)
|
1084 |
+
&& ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{"a"})) {
|
1085 |
+
my $core_suffix = $romanized_char;
|
1086 |
+
$core_suffix =~ s/^\+//;
|
1087 |
+
if ($prev_char_roman =~ /[aeiou]$/i) {
|
1088 |
+
$this->add_node($core_suffix, $i, $i+1, *chart_ht, "", "syllable-end-consonant");
|
1089 |
+
} else {
|
1090 |
+
$this->add_node(join("", $prev_char_roman, "a", $core_suffix), $prev_node_start, $i+1, *chart_ht, "", "syllable-end-consonant-with-added-a");
|
1091 |
+
$this->add_node(join("", "a", $core_suffix), $i, $i+1, *chart_ht, "backup", "syllable-end-consonant");
|
1092 |
+
}
|
1093 |
+
|
1094 |
+
# Japanese special cases
|
1095 |
+
} elsif ($char_name =~ /(?:HIRAGANA|KATAKANA) LETTER SMALL Y/) {
|
1096 |
+
if (($prev_script eq $current_script)
|
1097 |
+
&& (($prev_char_roman_consonant) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])i$/i))) {
|
1098 |
+
unless ($this->get_node_for_span_and_type($prev_node_start, $i+1, *chart_ht, "")) {
|
1099 |
+
$this->add_node("$prev_char_roman_consonant$romanized_char", $prev_node_start, $i+1, *chart_ht, "", "japanese-contraction");
|
1100 |
+
}
|
1101 |
+
} else {
|
1102 |
+
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "unexpected-japanese-contraction-character");
|
1103 |
+
}
|
1104 |
+
} elsif (($prev_script =~ /^(HIRAGANA|KATAKANA)$/i)
|
1105 |
+
&& ($char_name eq "KATAKANA-HIRAGANA PROLONGED SOUND MARK") # Choonpu
|
1106 |
+
&& (($prev_char_roman_vowel) = ($prev_char_roman =~ /([aeiou])$/i))) {
|
1107 |
+
$this->add_node("$prev_char_roman$prev_char_roman_vowel", $prev_node_start, $i+1, *chart_ht, "", "japanese-vowel-lengthening");
|
1108 |
+
} elsif (($current_script =~ /^(Hiragana|Katakana)$/i)
|
1109 |
+
&& ($char_name =~ /^(HIRAGANA|KATAKANA) LETTER SMALL TU$/i) # Sokuon/Sukun
|
1110 |
+
&& ($next_script eq $current_script)
|
1111 |
+
&& ($romanized_next_char = $this->romanize_char_at_position_incl_multi($i+1, $lang_code, $output_style, *ht, *chart_ht))
|
1112 |
+
&& (($doubled_consonant) = ($romanized_next_char =~ /^(ch|[bcdfghjklmnpqrstwz])/i))) {
|
1113 |
+
# Note: $romanized_next_char could be part of a multi-character mapping
|
1114 |
+
# print STDERR "current_script: $current_script char_name: $char_name next_script: $next_script romanized_next_char: $romanized_next_char doubled_consonant: $doubled_consonant\n";
|
1115 |
+
$doubled_consonant = "t" if $doubled_consonant eq "ch";
|
1116 |
+
$this->add_node($doubled_consonant, $i, $i+1, *chart_ht, "", "japanese-consonant-doubling");
|
1117 |
+
|
1118 |
+
# Greek small letter mu to micro-sign (instead of to "m") as used in abbreviations for microgram/micrometer/microliter/microsecond/micromolar/microfarad etc.
|
1119 |
+
} elsif (($char_name eq "GREEK SMALL LETTER MU")
|
1120 |
+
&& (! ($prev_script =~ /^GREEK$/))
|
1121 |
+
&& ($i < $#chars)
|
1122 |
+
&& ($chart_ht{ORIG_CHAR}->{($i+1)} =~ /^[cfgjlmstv]$/i)) {
|
1123 |
+
$this->add_node("\xC2\xB5", $i, $i+1, *chart_ht, "", "greek-mu-to-micro-sign");
|
1124 |
+
|
1125 |
+
# Gurmukhi addak (doubles following consonant)
|
1126 |
+
} elsif (($current_script eq "Gurmukhi")
|
1127 |
+
&& ($char_name eq "GURMUKHI ADDAK")) {
|
1128 |
+
if (($next_script eq $current_script)
|
1129 |
+
&& ($romanized_next_char = $this->romanize_char_at_position_incl_multi($i+1, $lang_code, $output_style, *ht, *chart_ht))
|
1130 |
+
&& (($doubled_consonant) = ($romanized_next_char =~ /^([bcdfghjklmnpqrstvwxz])/i))) {
|
1131 |
+
$this->add_node($doubled_consonant, $i, $i+1, *chart_ht, "", "gurmukhi-consonant-doubling");
|
1132 |
+
} else {
|
1133 |
+
$this->add_node("'", $i, $i+1, *chart_ht, "", "gurmukhi-unexpected-addak");
|
1134 |
+
}
|
1135 |
+
|
1136 |
+
# Subjoined character
|
1137 |
+
} elsif ($subjoined_char_p
|
1138 |
+
&& ($prev_script eq $current_script)
|
1139 |
+
&& (($prev_char_roman_consonant, $prev_char_roman_vowel) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])([aeiou]+)$/i))
|
1140 |
+
&& ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{(lc $prev_char_roman_vowel)})) {
|
1141 |
+
my $new_roman = "$prev_char_roman_consonant$romanized_char";
|
1142 |
+
$this->add_node($new_roman, $prev_node_start, $i+1, *chart_ht, "", "subjoined-character");
|
1143 |
+
# print STDERR " Subjoin l.$line_number/$i: $new_roman\n" if $line_number =~ /^[12]$/;
|
1144 |
+
|
1145 |
+
# Thai special case: written-pre-consonant-spoken-post-consonant
|
1146 |
+
} elsif (($char_name =~ /THAI CHARACTER/)
|
1147 |
+
&& ($prev_script eq $current_script)
|
1148 |
+
&& ($chart_ht{CHAR_SYLLABLE_INFO}->{($i-1)} =~ /written-pre-consonant-spoken-post-consonant/i)
|
1149 |
+
&& ($prev_char_roman =~ /^[aeiou]+$/i)
|
1150 |
+
&& ($romanized_char =~ /^[bcdfghjklmnpqrstvwxyz]/)) {
|
1151 |
+
$this->add_node("$romanized_char$prev_char_roman", $prev_node_start, $i+1, *chart_ht, "", "thai-vowel-consonant-swap");
|
1152 |
+
|
1153 |
+
# Thai special case: THAI CHARACTER O ANG (U+0E2D "\xE0\xB8\xAD")
|
1154 |
+
} elsif ($char_name eq "THAI CHARACTER O ANG") {
|
1155 |
+
if ($prev_script ne $current_script) {
|
1156 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "thai-initial-o-ang-drop");
|
1157 |
+
} elsif ($next_script ne $current_script) {
|
1158 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "thai-final-o-ang-drop");
|
1159 |
+
} else {
|
1160 |
+
my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
|
1161 |
+
my $romanized_prev2_char = $this->romanize_char_at_position($i-2, $lang_code, $output_style, *ht, *chart_ht);
|
1162 |
+
if (($prev_char_roman =~ /^[bcdfghjklmnpqrstvwxz]+$/i)
|
1163 |
+
&& ($romanized_next_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)) {
|
1164 |
+
$this->add_node("o", $i, $i+1, *chart_ht, "", "thai-middle-o-ang"); # keep between consonants
|
1165 |
+
} elsif (($prev2_script eq $current_script)
|
1166 |
+
&& 0
|
1167 |
+
&& ($prev_char_name =~ /^THAI CHARACTER MAI [A-Z]+$/) # Thai tone
|
1168 |
+
&& ($romanized_prev2_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)
|
1169 |
+
&& ($romanized_next_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)) {
|
1170 |
+
$this->add_node("o", $i, $i+1, *chart_ht, "", "thai-middle-o-ang"); # keep between consonant+tone-mark and consonant
|
1171 |
+
} else {
|
1172 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "thai-middle-o-ang-drop"); # drop next to vowel
|
1173 |
+
}
|
1174 |
+
}
|
1175 |
+
|
1176 |
+
# Romanization with space
|
1177 |
+
} elsif ($romanized_char =~ /\s/) {
|
1178 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "space");
|
1179 |
+
|
1180 |
+
# Tibetan special cases
|
1181 |
+
} elsif ($current_script eq "Tibetan") {
|
1182 |
+
|
1183 |
+
if ($subjoined_char_p
|
1184 |
+
&& ($prev_script eq $current_script)
|
1185 |
+
&& $prev_letter_plus_char_p
|
1186 |
+
&& ($prev_char_roman =~ /^[bcdfghjklmnpqrstvwxyz]+$/i)) {
|
1187 |
+
$this->add_node("$prev_char_roman$romanized_char", $prev_node_start, $i+1, *chart_ht, "", "subjoined-tibetan-character");
|
1188 |
+
} elsif ($romanized_char =~ /^-A$/i) {
|
1189 |
+
my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
|
1190 |
+
if (! $prev_letter_plus_char_p) {
|
1191 |
+
$this->add_node("'", $i, $i+1, *chart_ht, "", "tibetan-frontal-dash-a");
|
1192 |
+
} elsif (($prev_script eq $current_script)
|
1193 |
+
&& ($next_script eq $current_script)
|
1194 |
+
&& ($prev_char_roman =~ /[bcdfghjklmnpqrstvwxyz]$/)
|
1195 |
+
&& ($romanized_next_char =~ /^[aeiou]/)) {
|
1196 |
+
$this->add_node("a'", $i, $i+1, *chart_ht, "", "tibetan-medial-dash-a");
|
1197 |
+
} elsif (($prev_script eq $current_script)
|
1198 |
+
&& ($next_script eq $current_script)
|
1199 |
+
&& ($prev_char_roman =~ /[aeiou]$/)
|
1200 |
+
&& ($romanized_next_char =~ /[aeiou]/)) {
|
1201 |
+
$this->add_node("'", $i, $i+1, *chart_ht, "", "tibetan-reduced-medial-dash-a");
|
1202 |
+
} elsif (($prev_script eq $current_script)
|
1203 |
+
&& (! ($prev_char_roman =~ /[aeiou]/))
|
1204 |
+
&& (! $next_letter_plus_char_p)) {
|
1205 |
+
$this->add_node("a", $i, $i+1, *chart_ht, "", "tibetan-final-dash-a");
|
1206 |
+
} else {
|
1207 |
+
$this->add_node("a", $i, $i+1, *chart_ht, "", "unexpected-tibetan-dash-a");
|
1208 |
+
}
|
1209 |
+
} elsif (($romanized_char =~ /^[AEIOU]/i)
|
1210 |
+
&& ($prev_script eq $current_script)
|
1211 |
+
&& ($prev_char_roman =~ /^A$/i)
|
1212 |
+
&& (! $prev2_letter_plus_char_p)) {
|
1213 |
+
$this->add_node($romanized_char, $prev_node_start, $i+1, *chart_ht, "", "tibetan-dropped-word-initial-a");
|
1214 |
+
} else {
|
1215 |
+
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization");
|
1216 |
+
}
|
1217 |
+
|
1218 |
+
# Khmer (for MUUSIKATOAN etc. see under "Diacritic" above)
|
1219 |
+
} elsif (($current_script eq "Khmer")
|
1220 |
+
&& (($char_roman_consonant, $char_roman_vowel) = ($romanized_char =~ /^(.*[bcdfghjklmnpqrstvwxyz])([ao]+)-$/i))) {
|
1221 |
+
my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
|
1222 |
+
if (($next_script eq $current_script)
|
1223 |
+
&& ($romanized_next_char =~ /^[aeiouy]/i)) {
|
1224 |
+
$this->add_node($char_roman_consonant, $i, $i+1, *chart_ht, "", "khmer-vowel-drop");
|
1225 |
+
} else {
|
1226 |
+
$this->add_node("$char_roman_consonant$char_roman_vowel", $i, $i+1, *chart_ht, "", "khmer-standard-unicode-based-romanization");
|
1227 |
+
}
|
1228 |
+
|
1229 |
+
# Abudiga add default vowel
|
1230 |
+
} elsif ((@abudiga_default_vowels = sort keys %{$ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}})
|
1231 |
+
&& ($abudiga_default_vowel = $abudiga_default_vowels[0])
|
1232 |
+
&& ($romanized_char =~ /^[bcdfghjklmnpqrstvwxyz]+$/i)) {
|
1233 |
+
my $new_roman = join("", $romanized_char, $abudiga_default_vowel);
|
1234 |
+
$this->add_node($new_roman, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization-plus-abudiga-default-vowel");
|
1235 |
+
# print STDERR " Abudiga add default vowel l.$line_number/$i: $new_roman\n" if $line_number =~ /^[12]$/;
|
1236 |
+
|
1237 |
+
# Standard romanization
|
1238 |
+
} else {
|
1239 |
+
$node_id = $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization");
|
1240 |
+
}
|
1241 |
+
} else {
|
1242 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "unexpected-original");
|
1243 |
+
}
|
1244 |
+
} elsif (defined($romanized_char = $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht))
|
1245 |
+
&& ((length($romanized_char) <= 2)
|
1246 |
+
|| ($ht{UTF_TO_CHAR_ROMANIZATION}->{$char}))) { # or from unicode_overwrite_romanization table
|
1247 |
+
$romanized_char =~ s/^""$//;
|
1248 |
+
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "romanized-without-character-name");
|
1249 |
+
} else {
|
1250 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "unexpected-original-without-character-name");
|
1251 |
+
}
|
1252 |
+
}
|
1253 |
+
$i = $next_index;
|
1254 |
+
}
|
1255 |
+
|
1256 |
+
$this->schwa_deletion(0, $n_characters, *chart_ht, $lang_code);
|
1257 |
+
$this->default_vowelize_tibetan(0, $n_characters, *chart_ht, $lang_code, $line_number) if $chart_ht{CHART_CONTAINS_SCRIPT}->{"Tibetan"};
|
1258 |
+
$this->assemble_numbers_in_chart(*chart_ht, $line_number);
|
1259 |
+
|
1260 |
+
if ($return_chart_p) {
|
1261 |
+
} elsif ($return_offset_mappings_p) {
|
1262 |
+
($result, $offset_mappings, $new_char_offset, $new_rom_char_offset) = $this->best_romanized_string(0, $n_characters, *chart_ht, $control, $initial_char_offset, $initial_rom_char_offset);
|
1263 |
+
} else {
|
1264 |
+
$result = $this->best_romanized_string(0, $n_characters, *chart_ht) unless $return_chart_p;
|
1265 |
+
}
|
1266 |
+
|
1267 |
+
if ($verbosePM) {
|
1268 |
+
my $logfile = "/nfs/isd/ulf/cgi-mt/amr-tmp/uroman-log.txt";
|
1269 |
+
$util->append_to_file($logfile, $log) if $log && (-r $logfile);
|
1270 |
+
}
|
1271 |
+
|
1272 |
+
return ($result, $offset_mappings) if $return_offset_mappings_p;
|
1273 |
+
return *chart_ht if $return_chart_p;
|
1274 |
+
return $result;
|
1275 |
+
}
|
1276 |
+
|
1277 |
+
sub string_to_json_string {
   # Returns the JSON encoding of the single value $s.
   # $s is first decoded in place with utf8::decode. It is then wrapped in a
   # one-element array so that it can be passed to the JSON encoder, and the
   # enclosing square brackets are stripped from the encoder's output again.
   local($this, $s) = @_;

   utf8::decode($s);
   my $encoder = JSON->new->utf8;
   my $json_text = $encoder->encode([$s]);
   $json_text =~ s{^\[(.*)\]$}{$1};   # "[<value>]" -> "<value>"
   return $json_text;
}
|
1285 |
+
|
1286 |
+
sub chart_to_json_romanization_elements {
   # Serializes the chart span [$chart_start, $chart_end) as JSON objects, one per
   # romanization segment, each on its own line and each followed by ",\n"
   # (including the last one).
   #
   # Arguments:
   #   $chart_start, $chart_end  chart positions; $chart_end is exclusive
   #   *chart_ht                 the chart, passed as a typeglob and accessed as %chart_ht
   #   $line_number              copied verbatim into the "line" field of every object
   #
   # Each object has the fields "line", "start", "end" (index of the last original
   # character covered by the segment), "orig" (original text of the segment) and
   # "roms" (list of { "rom": ... } alternatives as returned by best_romanizations).
   #
   # If no romanization segment starts at a position, the single original character
   # at that position is emitted as its own romanization.
   local($this, $chart_start, $chart_end, *chart_ht, $line_number) = @_;

   my $result = "";
   my $start = $chart_start;
   while ($start < $chart_end) {
      my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
      my @best_romanizations = ();
      my $orig_segment;
      my $next_start;
      if (($end && ($start < $end))
       && (@best_romanizations = $this->best_romanizations($start, $end, *chart_ht))) {
         $orig_segment = $this->orig_string_at_span($start, $end, *chart_ht);
         $next_start = $end;
      } else {
         # Fallback: pass the original character through unchanged.
         # (Previously this used the never-assigned variable $orig, so the
         # romanization came out as JSON null.)
         $orig_segment = $chart_ht{ORIG_CHAR}->{$start};
         @best_romanizations = ($orig_segment);
         $next_start = $start + 1;
      }
      # Derive the reported end from $next_start rather than from $end:
      # in the fallback case $end may be "" (no segment found), which used to yield -1.
      my $last_index = $next_start - 1;
      my $guarded_orig = $this->string_to_json_string($orig_segment);
      $result .= " { \"line\": $line_number, \"start\": $start, \"end\": $last_index, \"orig\": $guarded_orig, \"roms\": [";
      foreach my $i ((0 .. $#best_romanizations)) {
         my $guarded_rom = $this->string_to_json_string($best_romanizations[$i]);
         $result .= " { \"rom\": $guarded_rom";
         $result .= " }";
         $result .= "," if $i < $#best_romanizations;
      }
      $result .= " ] },\n";
      $start = $next_start;
   }
   return $result;
}
|
1322 |
+
|
1323 |
+
sub default_vowelize_tibetan {
   # Post-processes the Tibetan letter tokens found in the chart span
   # [$chart_start, $chart_end). For each token, the romanization segments are
   # collected (per segment, the node with the lowest node id), and then:
   #   (1) if none of the collected romanizations contains a vowel, a default "a"
   #       is appended to one segment (see the index selection below);
   #   (2) a token-initial "'" that is directly followed by "o" is replaced by "";
   #   (3) a token-final segment consisting of consonant(s) plus "y" gets a final "a";
   #   (4) in any segment containing "-a", the first "-a" is reduced to "a".
   # Every change is made by adding a new node over the same span, copying the slot
   # values of the old node to it, and demoting the old node ("backup" or "alt").
   #
   # Arguments:
   #   $chart_start, $chart_end  chart positions; $chart_end is exclusive
   #   *chart_ht                 the chart, passed as a typeglob and accessed as %chart_ht
   #   $lang_code, $line_number  accepted for interface compatibility; not used here
   local($this, $chart_start, $chart_end, *chart_ht, $lang_code, $line_number) = @_;

   my $token_start = $chart_start;
   my $next_token_start = $chart_start;
   while (($token_start = $next_token_start) < $chart_end) {
      $next_token_start = $token_start + 1;   # default: advance by one position

      next unless $chart_ht{CHAR_LETTER_PLUS}->{$token_start};
      my $current_script = $chart_ht{CHAR_SCRIPT}->{$token_start};
      next unless ($current_script eq "Tibetan");
      my $token_end = $chart_ht{LETTER_TOKEN_SEGMENT_START_TO_END}->{$token_start};
      next unless $token_end;
      next unless $token_end > $token_start;
      $next_token_start = $token_end;         # continue after this token

      # Collect one node per romanization segment of the token.
      my $start = $token_start;
      my $end;
      my @node_ids = ();
      while ($start < $token_end) {
         $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
         last unless $end && ($end > $start);
         my @alt_node_ids = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}};
         last unless @alt_node_ids;
         push(@node_ids, $alt_node_ids[0]);
         $start = $end;
      }
      # Nothing to rewrite if no node was found. Without this guard, step (1)
      # below would call add_node with an undefined start and end.
      next unless @node_ids;

      my $contains_vowel_p = 0;
      my @romanizations = ();
      foreach my $node_id (@node_ids) {
         my $roman = $chart_ht{NODE_ROMAN}->{$node_id};
         $roman = "" unless defined($roman);
         push(@romanizations, $roman);
         $contains_vowel_p = 1 if $roman =~ /[aeiou]/i;
      }

      # (1) Token without any vowel: add default vowel "a" to one segment.
      unless ($contains_vowel_p) {
         my $default_vowel_target_index;
         if ($#node_ids <= 1) {
            # at most two segments: vowel goes after the first one
            $default_vowel_target_index = 0;
         } elsif ($romanizations[$#romanizations] eq "s") {
            # final "s": vowel goes after a preceding "y", otherwise one segment further left
            if ($romanizations[($#romanizations-1)] eq "y") {
               $default_vowel_target_index = $#romanizations-1;
            } else {
               $default_vowel_target_index = $#romanizations-2;
            }
         } else {
            # otherwise: vowel goes after the second-to-last segment
            $default_vowel_target_index = $#romanizations-1;
         }
         # Keep the local copy in sync; steps (2) and (3) below inspect @romanizations.
         $romanizations[$default_vowel_target_index] .= "a";
         my $old_node_id = $node_ids[$default_vowel_target_index];
         my $old_start = $chart_ht{NODE_START}->{$old_node_id};
         my $old_end = $chart_ht{NODE_END}->{$old_node_id};
         my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id};
         my $new_roman = $old_roman . "a";
         my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-default-vowel");
         $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
         $chart_ht{NODE_TYPE}->{$old_node_id} = "backup"; # keep, but demote
      }

      # (2) Token-initial apostrophe directly before "o": delete the apostrophe.
      if (($romanizations[0] eq "'")
       && ($#romanizations >= 1)
       && ($romanizations[1] =~ /^[o]$/)) {
         my $old_node_id = $node_ids[0];
         my $old_start = $chart_ht{NODE_START}->{$old_node_id};
         my $old_end = $chart_ht{NODE_END}->{$old_node_id};
         my $new_node_id = $this->add_node("", $old_start, $old_end, *chart_ht, "", "tibetan-delete-apostrophe");
         $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
         $chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; # keep, but demote
      }

      # (3) Token-final consonant(s)+"y": add syllable-final "a".
      if (($#node_ids >= 1)
       && ($romanizations[$#romanizations] =~ /^[bcdfghjklmnpqrstvwxz]+y$/)) {
         my $old_node_id = $node_ids[$#romanizations];
         my $old_start = $chart_ht{NODE_START}->{$old_node_id};
         my $old_end = $chart_ht{NODE_END}->{$old_node_id};
         my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id};
         my $new_roman = $old_roman . "a";
         my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-syllable-final-vowel");
         $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
         $chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; # keep, but demote
      }

      # (4) Reduce "-a" to "a" (first occurrence per segment).
      foreach my $old_node_id (@node_ids) {
         my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id};
         next unless $old_roman =~ /-a/;
         my $old_start = $chart_ht{NODE_START}->{$old_node_id};
         my $old_end = $chart_ht{NODE_END}->{$old_node_id};
         my $new_roman = $old_roman;
         $new_roman =~ s/-a/a/;
         my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-syllable-delete-dash");
         $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
         $chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; # keep, but demote
      }
   }
}
|
1418 |
+
|
1419 |
+
sub schwa_deletion {
   # Deletes a word-final simple "a" in Devanagari (e.g. nepaala -> nepaal);
   # see Wikipedia article "Schwa deletion in Indo-Aryan languages".
   #
   # Only runs if the chart is flagged as containing Devanagari. For every
   # Devanagari script segment of at least two positions inside
   # [$chart_start, $chart_end), the node covering the last position is examined:
   # if its romanization is consonant(s) + "a" and the node ending right before it
   # (spanning 3, 2 or 1 positions, tried in that order) contains a vowel, the old
   # node is demoted to "alt" and a new node with just the consonant(s) is added.
   #
   # *chart_ht is the chart, passed as a typeglob and accessed as %chart_ht.
   # $lang_code is not used here.
   local($this, $chart_start, $chart_end, *chart_ht, $lang_code) = @_;

   if ($chart_ht{CHART_CONTAINS_SCRIPT}->{"Devanagari"}) {
      my $next_pos;
      for (my $pos = $chart_start; $pos < $chart_end; $pos = $next_pos) {
         $next_pos = $pos + 1;   # default: advance by one position

         next if $chart_ht{CHAR_SCRIPT}->{$pos} ne "Devanagari";
         my $segment_end = $chart_ht{SCRIPT_SEGMENT_START_TO_END}->{$pos};
         next if ! $segment_end;
         next if $segment_end - $pos < 2;
         $next_pos = $segment_end;   # continue after this script segment

         my $final_pos = $segment_end - 1;
         my $final_node_id = $this->get_node_for_span($final_pos, $segment_end, *chart_ht);
         next if ! $final_node_id;
         my $final_roman = $chart_ht{NODE_ROMAN}->{$final_node_id};
         (($end_consonant) = ($final_roman =~ /^([bcdfghjklmnpqrstvwxz]+)a$/i)) or next;

         # Look for the node ending right before the final position, widest span first.
         my $preceding_node_id;
         foreach my $width (3, 2, 1) {
            $preceding_node_id = $this->get_node_for_span($final_pos - $width, $final_pos, *chart_ht);
            last if $preceding_node_id;
         }
         next if ! $preceding_node_id;
         my $preceding_roman = $chart_ht{NODE_ROMAN}->{$preceding_node_id};
         next if ! ($preceding_roman =~ /[aeiou]/i);
         # TO DO: check further back for vowel (e.g. if the preceding romanization is "r" due to vowel cancelation)

         $chart_ht{NODE_TYPE}->{$final_node_id} = "alt"; # keep, but demote
         $this->add_node($end_consonant, $final_pos, $segment_end, *chart_ht, "", "devanagari-with-deleted-final-schwa");
      }
   }
}
|
1454 |
+
|
1455 |
+
sub best_romanized_string {
   # Assembles the romanization of the chart span [$chart_start, $chart_end) by
   # walking it segment by segment and appending the first entry returned by
   # best_romanizations for each segment. Where no segment or no romanization is
   # available, the original character at that position is copied instead and
   # the walk advances by one position.
   #
   # Arguments:
   #   *chart_ht          the chart, passed as a typeglob and accessed as %chart_ht
   #   $control           optional; if it contains the phrase "return offset mappings",
   #                      offset mappings are computed and returned as well
   #   $orig_char_offset, $rom_char_offset
   #                      optional initial offsets (default 0) for the offset mappings
   #
   # Returns the romanized string, or, when offset mappings are requested, the list
   #   (romanized string, comma-separated "orig:rom" offset pairs,
   #    final original-character offset, final romanized-character offset).
   local($this, $chart_start, $chart_end, *chart_ht, $control, $orig_char_offset, $rom_char_offset) = @_;

   $control = "" unless defined($control);
   my $want_offsets_p = ($control =~ /\breturn offset mappings\b/);
   my $orig_offset = $orig_char_offset || 0;
   my $rom_offset = $rom_char_offset || 0;
   my @offset_pairs = ("$orig_offset:$rom_offset");
   my $romanized = "";
   my $pos = $chart_start;
   while ($pos < $chart_end) {
      my $segment_end = $this->find_end_of_rom_segment($pos, $chart_end, *chart_ht);
      my $segment_rom;
      if ($segment_end && ($pos < $segment_end)) {
         my @candidates = $this->best_romanizations($pos, $segment_end, *chart_ht);
         $segment_rom = $candidates[0] if @candidates;
      }
      my $n_orig_chars;
      if (defined($segment_rom)) {
         $n_orig_chars = $segment_end - $pos;
         $romanized .= $segment_rom;
         $pos = $segment_end;
      } else {
         # no usable romanization: keep the original character
         $segment_rom = $chart_ht{ORIG_CHAR}->{$pos};
         $n_orig_chars = 1;
         $romanized .= $segment_rom;
         $pos++;
      }
      if ($want_offsets_p) {
         $orig_offset = $orig_offset + $n_orig_chars;
         $rom_offset = $rom_offset + $utf8->length_in_utf8_chars($segment_rom);
         push(@offset_pairs, "$orig_offset:$rom_offset");
      }
   }
   if ($want_offsets_p) {
      return ($romanized, join(",", @offset_pairs), $orig_offset, $rom_offset);
   }
   return $romanized;
}
|
1510 |
+
|
1511 |
+
sub orig_string_at_span {
   # Returns the concatenation of the original characters stored in the chart
   # for positions $start through $end-1 ($end is exclusive).
   # *chart_ht is the chart, passed as a typeglob and accessed as %chart_ht.
   local($this, $start, $end, *chart_ht) = @_;

   my @positions = ($start .. ($end-1));
   return join("", map($chart_ht{ORIG_CHAR}->{$_}, @positions));
}
|
1520 |
+
|
1521 |
+
sub find_end_of_rom_segment {
   # Returns the largest end position of any node that starts at $start and does
   # not extend beyond $chart_end, provided that end position is greater than
   # $start. Returns "" if there is no such node.
   # *chart_ht is the chart, passed as a typeglob and accessed as %chart_ht.
   local($this, $start, $chart_end, *chart_ht) = @_;

   # The result is kept in a lexical; assigning to the package variable $end
   # (as was done before) could clobber a caller's dynamically scoped $end.
   my $best_end;
   foreach my $candidate_end (keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}}) {
      next if $candidate_end > $chart_end;
      $best_end = $candidate_end unless defined($best_end) && ($best_end >= $candidate_end);
   }
   return $best_end if defined($best_end) && ($start < $best_end);
   return "";
}
|
1537 |
+
|
1538 |
+
sub best_romanizations {
   # Returns the list of distinct romanizations of the nodes spanning exactly
   # $start to $end, best first:
   #   - romanizations of regular nodes (any node type other than "alt"/"backup"),
   #     sorted, followed by romanizations of "alt" nodes, sorted, that are not
   #     already in the list;
   #   - only if that list is empty: the sorted romanizations of "backup" nodes.
   # Nodes without a defined romanization are ignored.
   # *chart_ht is the chart, passed as a typeglob and accessed as %chart_ht.
   # Intended to be called in list context.
   local($this, $start, $end, *chart_ht) = @_;

   # Lexical accumulators: the result must not depend on (or leak into) package globals.
   my @regular_romanizations = ();
   my @alt_romanizations = ();
   my @backup_romanizations = ();

   foreach my $node_id (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}) {
      my $roman = $chart_ht{NODE_ROMAN}->{$node_id};
      next unless defined($roman);
      my $type = $chart_ht{NODE_TYPE}->{$node_id};
      $type = "" unless defined($type);   # a missing type counts as a regular node
      if ($type eq "backup") {
         push(@backup_romanizations, $roman) unless $util->member($roman, @backup_romanizations);
      } elsif ($type eq "alt") {
         push(@alt_romanizations, $roman) unless $util->member($roman, @alt_romanizations);
      } else {
         push(@regular_romanizations, $roman) unless $util->member($roman, @regular_romanizations);
      }
   }
   my @regular_alt_romanizations = sort @regular_romanizations;
   foreach my $alt_romanization (sort @alt_romanizations) {
      push(@regular_alt_romanizations, $alt_romanization) unless $util->member($alt_romanization, @regular_alt_romanizations);
   }
   return @regular_alt_romanizations if @regular_alt_romanizations;
   return sort @backup_romanizations;
}
|
1565 |
+
|
1566 |
+
sub join_alt_romanizations_for_viz {
    # Join alternative romanizations with ", " for display; an empty
    # romanization is shown as "-" so that it remains visible.
    local($this, @list) = @_;

    return join(", ", map { ($_ eq "") ? "-" : $_ } @list);
}
|
1580 |
+
|
1581 |
+
sub markup_orig_rom_strings {
    # Build two parallel HTML strings for the chart span
    # $chart_start .. $chart_end: one holding the romanized segments and
    # one holding the corresponding original segments. Each segment is
    # wrapped in a <span> whose id is derived from a running group index,
    # so that the two spans of a pair can highlight each other via
    # highlight_elems(...), and carries a title attribute with details.
    #
    # Arguments: chart start/end positions, the typeglobs of the general
    # table (*ht), the chart (*chart_ht) and the pinyin table (*pinyin_ht),
    # and the last group index used so far.
    # Returns the list ($marked_up_rom, $marked_up_orig, $last_group_id_index)
    # with the group index advanced by one per emitted segment.
    #
    # NOTE: several string literals below contain non-printing directional
    # characters and literal line breaks; keep them exactly as they are.
    local($this, $chart_start, $chart_end, *ht, *chart_ht, *pinyin_ht, $last_group_id_index) = @_;

    my $marked_up_rom = "";
    my $marked_up_orig = "";
    my $start = $chart_start;
    while ($start < $chart_end) {
        my $segment_start = $start;
        my $segment_end = $start+1;
        my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
        my $rom_segment = "";
        my $orig_segment = "";
        my $rom_title = "";
        my $orig_title = "";
        my $contains_alt_romanizations = 0;
        if ($end) {
            $segment_end = $end;
            my @best_romanizations = $this->best_romanizations($start, $end, *chart_ht);
            my $best_romanization = (@best_romanizations) ? $best_romanizations[0] : undef;
            if (defined($best_romanization)) {
                $rom_segment .= $best_romanization;
                $orig_segment .= $this->orig_string_at_span($start, $end, *chart_ht);
                $segment_end = $end;
                if ($#best_romanizations >= 1) {
                    $rom_title .= $util->guard_html("Alternative romanizations: " . $this->join_alt_romanizations_for_viz(@best_romanizations) . "\n");
                    $contains_alt_romanizations = 1;
                }
            } else {
                # No romanization for the span: fall back to copying a
                # single original character.
                my $segment = $this->orig_string_at_span($start, $start+1, *chart_ht);
                $rom_segment .= $segment;
                $orig_segment .= $segment;
                $segment_end = $start+1;
            }
            $start = $segment_end;
        } else {
            # No chart node starts here: copy one original character.
            $rom_segment .= $chart_ht{ORIG_CHAR}->{$start};
            $orig_segment .= $this->orig_string_at_span($start, $start+1, *chart_ht);
            $segment_end = $start+1;
            $start = $segment_end;
        }

        # Absorb following combining characters into the same segment as
        # long as they have a romanization of their own.
        # (@best_romanizations and $best_romanization are package
        # variables at this point; the lexicals above are out of scope.)
        my $next_char = $chart_ht{ORIG_CHAR}->{$segment_end};
        my $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
        while ($next_char_is_combining_p
            && ($segment_end < $chart_end)
            && ($end = $this->find_end_of_rom_segment($segment_end, $chart_end, *chart_ht))
            && ($end > $segment_end)
            && (@best_romanizations = $this->best_romanizations($segment_end, $end, *chart_ht))
            && defined($best_romanization = $best_romanizations[0])) {
            $orig_segment .= $this->orig_string_at_span($segment_end, $end, *chart_ht);
            $rom_segment .= $best_romanization;
            if ($#best_romanizations >= 1) {
                $rom_title .= $util->guard_html("Alternative romanizations: " . $this->join_alt_romanizations_for_viz(@best_romanizations) . "\n");
                $contains_alt_romanizations = 1;
            }
            $segment_end = $end;
            $start = $segment_end;
            $next_char = $chart_ht{ORIG_CHAR}->{$segment_end};
            $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
        }

        # Title for the original segment: one description per character.
        foreach $i (($segment_start .. ($segment_end-1))) {
            $orig_title .= "+‎ ‎" unless $orig_title eq "";
            my $char = $chart_ht{ORIG_CHAR}->{$i};
            my $numeric = $ht{UTF_TO_NUMERIC}->{$char};
            $numeric = "" unless defined($numeric);
            my $pic_descr = $ht{UTF_TO_PICTURE_DESCR}->{$char};
            $pic_descr = "" unless defined($pic_descr);
            if ($char =~ /^\xE4\xB7[\x80-\xBF]$/) {
                # UTF-8 bytes E4 B7 80..BF encode U+4DC0..U+4DFF.
                # Fix: the name is looked up here. Previously this branch
                # appended $char_name without assigning it, so it showed
                # whatever name an earlier iteration had left behind (or
                # nothing at all).
                my $block_char_name = $ht{UTF_TO_CHAR_NAME}->{$char};
                unless (defined($block_char_name) && ($block_char_name ne "")) {
                    # Same fallback text as the generic branch below.
                    $block_char_name = "Unicode character U+" . (uc sprintf("%04x", $utf8->utf8_to_unicode($char)));
                }
                $orig_title .= "$block_char_name\n";
            } elsif (($char =~ /^[\xE3-\xE9][\x80-\xBF]{2,2}$/) && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)) {
                my $unicode = $utf8->utf8_to_unicode($char);
                $orig_title .= "CJK Unified Ideograph U+" . (uc sprintf("%04x", $unicode)) . "\n";
                $orig_title .= "Chinese: $tonal_translit\n" if $tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, "");
                $orig_title .= "Number: $numeric\n" if $numeric =~ /\d/;
            } elsif ($char_name = $ht{UTF_TO_CHAR_NAME}->{$char}) {
                $orig_title .= "$char_name\n";
                $orig_title .= "Number: $numeric\n" if $numeric =~ /\d/;
                $orig_title .= "Picture: $pic_descr\n" if $pic_descr =~ /\S/;
            } else {
                my $unicode = $utf8->utf8_to_unicode($char);
                if (($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) {
                    $orig_title .= "Hangul syllable U+" . (uc sprintf("%04x", $unicode)) . "\n";
                } else {
                    $orig_title .= "Unicode character U+" . (uc sprintf("%04x", $unicode)) . "\n";
                }
            }
        }

        # Title for the romanized segment: describe every non-ASCII
        # character that survived romanization.
        (@non_ascii_roms) = ($rom_segment =~ /([\xC0-\xFF][\x80-\xBF]*)/g);
        foreach $char (@non_ascii_roms) {
            my $char_name = $ht{UTF_TO_CHAR_NAME}->{$char};
            my $unicode = $utf8->utf8_to_unicode($char);
            my $unicode_s = "U+" . (uc sprintf("%04x", $unicode));
            if ($char_name) {
                $rom_title .= "$char_name\n";
            } else {
                $rom_title .= "$unicode_s\n";
            }
        }

        $last_group_id_index++;
        $rom_title =~ s/\s*$//;
        # (the replacement text of the next substitution is a literal line break)
        $rom_title =~ s/\n/
/g;
        $orig_title =~ s/\s*$//;
        # (literal line break followed by a non-printing character)
        $orig_title =~ s/\n/
‎/g;
        $orig_title = "‭" . $orig_title . "‬";
        my $rom_title_clause = ($rom_title eq "") ? "" : " title=\"$rom_title\"";
        my $orig_title_clause = ($orig_title eq "") ? "" : " title=\"$orig_title\"";
        my $alt_rom_clause = ($contains_alt_romanizations) ? "border-bottom:1px dotted;" : "";
        $marked_up_rom .= "<span id=\"span-$last_group_id_index-1\" onmouseover=\"highlight_elems('span-$last_group_id_index','1');\" onmouseout=\"highlight_elems('span-$last_group_id_index','0');\" style=\"color:#00BB00;$alt_rom_clause\"$rom_title_clause>" . $util->guard_html($rom_segment) . "<\/span>";
        $marked_up_orig .= "<span id=\"span-$last_group_id_index-2\" onmouseover=\"highlight_elems('span-$last_group_id_index','1');\" onmouseout=\"highlight_elems('span-$last_group_id_index','0');\"$orig_title_clause>" . $util->guard_html($orig_segment) . "<\/span>";

        # Offer a line-break opportunity after certain punctuation marks
        # (selected by the character name of the segment's last character).
        if (($last_char = $chart_ht{ORIG_CHAR}->{($segment_end-1)})
            && ($last_char_name = $ht{UTF_TO_CHAR_NAME}->{$last_char})
            && ($last_char_name =~ /^(FULLWIDTH COLON|FULLWIDTH COMMA|FULLWIDTH RIGHT PARENTHESIS|IDEOGRAPHIC COMMA|IDEOGRAPHIC FULL STOP|RIGHT CORNER BRACKET|BRAILLE PATTERN BLANK|TIBETAN MARK .*)$/)) {
            $marked_up_orig .= "<wbr>";
            $marked_up_rom .= "<wbr>";
        }
    }
    return ($marked_up_rom, $marked_up_orig, $last_group_id_index);
}
|
1699 |
+
|
1700 |
+
sub romanizations_with_alternatives {
    # Romanize the chart span $chart_start .. $chart_end (defaults: 0 and
    # $chart_ht{N_CHARS}) and return the result as one string. Where a
    # segment has several best romanizations they are all shown, written
    # as "{a|b|...}"; where there is none, the original character is
    # copied through.
    local($this, *ht, *chart_ht, *pinyin_ht, $chart_start, $chart_end) = @_;

    $chart_start = 0 unless defined($chart_start);
    $chart_end = $chart_ht{N_CHARS} unless defined($chart_end);

    my @pieces = ();
    my $pos = $chart_start;
    while ($pos < $chart_end) {
        my $span_end = $this->find_end_of_rom_segment($pos, $chart_end, *chart_ht);
        unless ($span_end) {
            # No chart node starts here: copy one original character.
            push(@pieces, $chart_ht{ORIG_CHAR}->{$pos});
            $pos = $pos + 1;
            next;
        }
        my @candidates = $this->best_romanizations($pos, $span_end, *chart_ht);
        if (! @candidates) {
            # A span exists but carries no romanization: copy one character.
            push(@pieces, $this->orig_string_at_span($pos, $pos+1, *chart_ht));
            $pos = $pos + 1;
        } else {
            my $shown = (@candidates == 1)
                ? $candidates[0]
                : "{" . join("|", @candidates) . "}";
            push(@pieces, $shown);
            $pos = $span_end;
        }
    }
    return join("", @pieces);
}
|
1742 |
+
|
1743 |
+
sub quick_romanize {
    # Table-driven romanization of string $s without building a chart.
    # At each position the longest prefix (4 characters down to 1) that
    # has an entry in the language-specific mapping table for $lang_code,
    # or else in the general mapping table, is replaced by a mapping from
    # that table. Characters without any entry are copied unchanged.
    local($this, $s, $lang_code, *ht) = @_;

    my $result = "";
    my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
    CHAR: while (@chars) {
        for (my $len = 4; $len >= 1; $len--) {
            next if $len > scalar(@chars);    # not enough characters left
            # ($multi_char_substring is a package variable, as before)
            $multi_char_substring = join("", @chars[0..($len-1)]);
            my @mappings = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}};
            @mappings = keys %{$ht{UTF_CHAR_MAPPING}->{$multi_char_substring}} unless @mappings;
            next unless @mappings;
            $result .= $mappings[0];
            splice(@chars, 0, $len);          # consume the matched prefix
            next CHAR;
        }
        # No table entry for any prefix length: pass the character through.
        $result .= shift @chars;
    }
    return $result;
}
|
1772 |
+
|
1773 |
+
sub char_is_combining_char {
    # True if the category recorded for character $c under UTF_TO_CAT
    # starts with "M". Returns 0 for an empty/false $c and for
    # characters without a category entry.
    local($this, $c, *ht) = @_;

    my $category = ($c) ? $ht{UTF_TO_CAT}->{$c} : "";
    return 0 unless $category;
    return $category =~ /^M/;
}
|
1781 |
+
|
1782 |
+
sub mark_up_string_for_mouse_over {
    # Return an HTML version of string $s in which characters are wrapped
    # in <span title="..."> elements describing them (CJK ideograph code
    # point and pinyin, character name, numeric value, picture
    # description, Hangul code point). A base character and the combining
    # characters that follow it share one span. If $control contains
    # "NO-ASCII", ASCII characters not followed by a combining character
    # are emitted without a span.
    #
    # NOTE: apart from $result, the working variables of this sub are
    # package variables (no "my"), and some title strings contain literal
    # line breaks; both are kept exactly as in the original.
    local($this, $s, *ht, $control, *pinyin_ht) = @_;

    if (! defined($control)) {
        $control = "";
    }
    $no_ascii_p = ($control =~ /NO-ASCII/);
    my $result = "";
    @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
    while (@chars) {
        $char = shift @chars;
        $numeric = $ht{UTF_TO_NUMERIC}->{$char};
        if (! defined($numeric)) {
            $numeric = "";
        }
        $pic_descr = $ht{UTF_TO_PICTURE_DESCR}->{$char};
        if (! defined($pic_descr)) {
            $pic_descr = "";
        }
        # Peek at the following character (if any).
        $next_char = ($#chars >= 0) ? $chars[0] : "";
        $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);

        if ($no_ascii_p
            && ($char =~ /^[\x00-\x7F]*$/)
            && ! $next_char_is_combining_p) {
            # plain ASCII, no annotation wanted
            $result .= $util->guard_html($char);
        } elsif (($char =~ /^[\xE3-\xE9][\x80-\xBF]{2,2}$/) && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)) {
            $unicode = $utf8->utf8_to_unicode($char);
            $title = "CJK Unified Ideograph U+" . (uc sprintf("%04x", $unicode));
            if ($tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, "")) {
                $title .= "
Chinese: $tonal_translit";
            }
            if ($numeric =~ /\d/) {
                $title .= "
Number: $numeric";
            }
            $result .= "<span title=\"$title\">" . $util->guard_html($char) . "<\/span>";
        } elsif ($char_name = $ht{UTF_TO_CHAR_NAME}->{$char}) {
            $title = $char_name;
            if ($numeric =~ /\d/) {
                $title .= "
Number: $numeric";
            }
            if ($pic_descr =~ /\S/) {
                $title .= "
Picture: $pic_descr";
            }
            $char_plus = $char;
            while ($next_char_is_combining_p) {
                # combining marks (Mn: non-spacing, Mc: spacing combining, Me: enclosing)
                $next_char_name = $ht{UTF_TO_CHAR_NAME}->{$next_char};
                $title .= "
+ $next_char_name";
                $char = shift @chars;
                $char_plus .= $char;
                $next_char = ($#chars >= 0) ? $chars[0] : "";
                $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
            }
            $result .= "<span title=\"$title\">" . $util->guard_html($char_plus) . "<\/span>";
            # line-break opportunity after certain punctuation marks
            if ($char_name =~ /^(FULLWIDTH COLON|FULLWIDTH COMMA|FULLWIDTH RIGHT PARENTHESIS|IDEOGRAPHIC COMMA|IDEOGRAPHIC FULL STOP|RIGHT CORNER BRACKET)$/) {
                $result .= "<wbr>";
            }
        } elsif (($unicode = $utf8->utf8_to_unicode($char))
            && ($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) {
            $title = "Hangul syllable U+" . (uc sprintf("%04x", $unicode));
            $result .= "<span title=\"$title\">" . $util->guard_html($char) . "<\/span>";
        } else {
            $result .= $util->guard_html($char);
        }
    }
    return $result;
}
|
1833 |
+
|
1834 |
+
sub romanize_char_at_position_incl_multi {
    # Romanize the chart character at position $i, consulting the
    # mapping tables first:
    #   1. the language-specific mapping table for $lang_code,
    #   2. the general mapping table,
    #   3. otherwise whatever romanize_char_at_position returns.
    # Returns "" if the chart has no character at position $i.
    local($this, $i, $lang_code, $output_style, *ht, *chart_ht) = @_;

    my $char = $chart_ht{ORIG_CHAR}->{$i};
    if (! defined($char)) {
        return "";
    }

    my @lang_specific = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$char}};
    if (@lang_specific) {
        return $lang_specific[0];
    }

    my @general = keys %{$ht{UTF_CHAR_MAPPING}->{$char}};
    if (@general) {
        return $general[0];
    }

    return $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht);
}
|
1845 |
+
|
1846 |
+
sub romanize_char_at_position {
    # Romanize the single chart character at position $i.
    #   * "" if the chart has no character there
    #   * ASCII characters are returned unchanged
    #   * otherwise the entry in UTF_TO_CHAR_ROMANIZATION, if there is one
    #   * otherwise a romanization derived from the character's name
    # Name-derived romanizations that look implausible are counted in
    # $ht{SUSPICIOUS_ROMANIZATION}.
    local($this, $i, $lang_code, $output_style, *ht, *chart_ht) = @_;

    my $char = $chart_ht{ORIG_CHAR}->{$i};
    return "" unless defined($char);
    if ($char =~ /^[\x00-\x7F]$/) {    # ASCII
        return $char;
    }
    my $romanization = $ht{UTF_TO_CHAR_ROMANIZATION}->{$char};
    if ($romanization) {
        return $romanization;
    }

    my $char_name = $chart_ht{CHAR_NAME}->{$i};
    $romanization = $this->romanize_charname($char_name, $lang_code, $output_style, *ht, $char);

    # A romanization is accepted without comment if it is short, contains
    # whitespace, or matches one of the known-good patterns below.
    my $acceptable_p =
           (length($romanization) < 4)
        || ($romanization =~ /\s/)
        || ($romanization =~ /^[bcdfghjklmnpqrstvwxyz]{2,3}[aeiou]-$/) # Khmer ngo-/nyo-/pho- OK
        || ($romanization =~ /^[bcdfghjklmnpqrstvwxyz]{2,2}[aeiougw][aeiou]{1,2}$/) # Canadian, Ethiopic syllable OK
        || ($romanization =~ /^(allah|bbux|nyaa|nnya|quuv|rrep|shch|shur|syrx)$/i) # Arabic; Yi; Ethiopic syllable nyaa; Cyrillic letter shcha
        || (($char_name =~ /^(YI SYLLABLE|VAI SYLLABLE|ETHIOPIC SYLLABLE|CANADIAN SYLLABICS|CANADIAN SYLLABICS CARRIER)\s+(\S+)$/) && (length($romanization) <= 5));
    if (! $acceptable_p) {
        $ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization}
            = ($ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization} || 0) + 1;
    }
    return $romanization;
}
|
1867 |
+
|
1868 |
+
sub romanize_charname {
    # Derive a romanization from a Unicode character name, e.g.
    # "CYRILLIC SMALL LETTER SHCHA" -> "shch".
    #
    # Arguments: the character name, language code and output style (the
    # latter two only take part in the cache key), the typeglob of the
    # general table (*ht), and the character itself (consulted only for
    # names containing "THAI CHARACTER").
    # Returns the romanization, upper-cased if the name contains the word
    # CAPITAL and lower-cased otherwise.
    #
    # Results are cached in
    #   $ht{ROMANIZE_CHARNAME}->{name}->{lang_code}->{output_style}
    # Fixes relative to the previous version:
    #   * the cache was never consulted because the guard tested the
    #     misspelled variable $cashed_result;
    #   * results were stored under the already-shortened name instead of
    #     the name used for the lookup, so a lookup could never find them.
    # NOTE(review): caching assumes a given name is always passed together
    # with the same $char -- confirm against callers.
    local($this, $char_name, $lang_code, $output_style, *ht, $char) = @_;

    my $cached_result = $ht{ROMANIZE_CHARNAME}->{$char_name}->{$lang_code}->{$output_style};
    return $cached_result if defined($cached_result);

    # ($orig_char_name is a package variable, as before.)
    $orig_char_name = $char_name;

    # Strip leading script/kind descriptors, keeping the letter's own name.
    $char_name =~ s/^.* LETTER\s+([A-Z]+)-\d+$/$1/; # HENTAIGANA LETTER A-3
    $char_name =~ s/^.* LETTER\s+//;
    $char_name =~ s/^.* SYLLABLE\s+B\d\d\d\s+//; # Linear B syllables
    $char_name =~ s/^.* SYLLABLE\s+//;
    $char_name =~ s/^.* SYLLABICS\s+//;
    $char_name =~ s/^.* LIGATURE\s+//;
    $char_name =~ s/^.* VOWEL SIGN\s+//;
    $char_name =~ s/^.* CONSONANT SIGN\s+//;
    $char_name =~ s/^.* CONSONANT\s+//;
    $char_name =~ s/^.* VOWEL\s+//;
    # Strip trailing qualifiers.
    $char_name =~ s/ WITH .*$//;
    $char_name =~ s/ WITHOUT .*$//;
    $char_name =~ s/\s+(ABOVE|AGUNG|BAR|BARREE|BELOW|CEDILLA|CEREK|DIGRAPH|DOACHASHMEE|FINAL FORM|GHUNNA|GOAL|INITIAL FORM|ISOLATED FORM|KAWI|LELET|LELET RASWADI|LONSUM|MAHAPRANA|MEDIAL FORM|MURDA|MURDA MAHAPRANA|REVERSED|ROTUNDA|SASAK|SUNG|TAM|TEDUNG|TYPE ONE|TYPE TWO|WOLOSO)\s*$//;
    $char_name =~ s/^([A-Z]+)\d+$/$1/; # Linear B syllables etc.
    # Strip leading modifier words; applied three times in a row.
    foreach $_ ((1 .. 3)) {
        $char_name =~ s/^.*\b(?:ABKHASIAN|ACADEMY|AFRICAN|AIVILIK|AITON|AKHMIMIC|ALEUT|ALI GALI|ALPAPRAANA|ALTERNATE|ALTERNATIVE|AMBA|ARABIC|ARCHAIC|ASPIRATED|ATHAPASCAN|BASELINE|BLACKLETTER|BARRED|BASHKIR|BERBER|BHATTIPROLU|BIBLE-CREE|BIG|BINOCULAR|BLACKFOOT|BLENDED|BOTTOM|BROAD|BROKEN|CANDRA|CAPITAL|CARRIER|CHILLU|CLOSE|CLOSED|COPTIC|CROSSED|CRYPTOGRAMMIC|CURLED|CURLY|CYRILLIC|DANTAJA|DENTAL|DIALECT-P|DIAERESIZED|DOTLESS|DOUBLE|DOUBLE-STRUCK|EASTERN PWO KAREN|EGYPTOLOGICAL|FARSI|FINAL|FLATTENED|GLOTTAL|GREAT|GREEK|HALF|HIGH|INITIAL|INSULAR|INVERTED|IOTIFIED|JONA|KANTAJA|KASHMIRI|KHAKASSIAN|KHAMTI|KHANDA|KINNA|KIRGHIZ|KOMI|L-SHAPED|LATINATE|LITTLE|LONG|LONG-LEGGED|LOOPED|LOW|MAHAAPRAANA|MALAYALAM|MANCHU|MANDAILING|MATHEMATICAL|MEDIAL|MIDDLE-WELSH|MON|MONOCULAR|MOOSE-CREE|MULTIOCULAR|MUURDHAJA|N-CREE|NARROW|NASKAPI|NDOLE|NEUTRAL|NIKOLSBURG|NORTHERN|NUBIAN|NUNAVIK|NUNAVUT|OJIBWAY|OLD|OPEN|ORKHON|OVERLONG|PALI|PERSIAN|PHARYNGEAL|PRISHTHAMATRA|R-CREE|REDUPLICATION|REVERSED|ROMANIAN|ROUND|ROUNDED|RUDIMENTA|RUMAI PALAUNG|SANSKRIT|SANYAKA|SARA|SAYISI|SCRIPT|SEBATBEIT|SEMISOFT|SGAW KAREN|SHAN|SHARP|SHWE PALAUNG|SHORT|SIBE|SIDEWAYS|SIMALUNGUN|SMALL|SOGDIAN|SOFT|SOUTH-SLAVEY|SOUTHERN|SPIDERY|STIRRUP|STRAIGHT|STRETCHED|SUBSCRIPT|SWASH|TAI LAING|TAILED|TAILLESS|TAALUJA|TH-CREE|TALL|THREE-LEGGED|TURNED|TODO|TOP|TROKUTASTI|TUAREG|UKRAINIAN|UNBLENDED|VISIGOTHIC|VOCALIC|VOICED|VOICELESS|VOLAPUK|WAVY|WESTERN PWO KAREN|WEST-CREE|WESTERN|WIDE|WOODS-CREE|Y-CREE|YENISEI|YIDDISH)\s+//;
    }
    $char_name =~ s/\s+(ABOVE|AGUNG|BAR|BARREE|BELOW|CEDILLA|CEREK|DIGRAPH|DOACHASHMEE|FINAL FORM|GHUNNA|GOAL|INITIAL FORM|ISOLATED FORM|KAWI|LELET|LELET RASWADI|LONSUM|MAHAPRANA|MEDIAL FORM|MURDA|MURDA MAHAPRANA|REVERSED|ROTUNDA|SASAK|SUNG|TAM|TEDUNG|TYPE ONE|TYPE TWO|WOLOSO)\s*$//;

    if ($char_name =~ /THAI CHARACTER/) {
        $char_name =~ s/^THAI CHARACTER\s+//;
        if ($char =~ /^\xE0\xB8[\x81-\xAE]/) {
            # Thai consonants
            $char_name =~ s/^([^AEIOU]*).*/$1/i;
        } elsif ($char_name =~ /^SARA [AEIOU]/) {
            # Thai vowels
            $char_name =~ s/^SARA\s+//;
        } else {
            $char_name = $char;
        }
    }

    if ($orig_char_name =~ /(HIRAGANA LETTER|KATAKANA LETTER|SYLLABLE|LIGATURE)/) {
        $char_name = lc $char_name;
    } elsif ($char_name =~ /\b(ANUSVARA|ANUSVARAYA|NIKAHIT|SIGN BINDI|TIPPI)\b/) {
        $char_name = "+m";
    } elsif ($char_name =~ /\bSCHWA\b/) {
        $char_name = "e";
    } elsif ($char_name =~ /\bIOTA\b/) {
        $char_name = "i";
    } elsif ($char_name =~ /\s/) {
        # names that still contain whitespace are left as they are
    } elsif ($orig_char_name =~ /KHMER LETTER/) {
        $char_name .= "-";
    } elsif ($orig_char_name =~ /CHEROKEE LETTER/) {
        # use whole letter as is
    } elsif ($orig_char_name =~ /KHMER INDEPENDENT VOWEL/) {
        $char_name =~ s/q//;
    } elsif ($orig_char_name =~ /LETTER/) {
        # Reduce a letter name to its characteristic consonants or vowels.
        $char_name =~ s/^[AEIOU]+([^AEIOU]+)$/$1/i;
        $char_name =~ s/^([^-AEIOUY]+)[AEIOU].*/$1/i;
        $char_name =~ s/^(Y)[AEIOU].*/$1/i if $orig_char_name =~ /\b(?:BENGALI|DEVANAGARI|GURMUKHI|GUJARATI|KANNADA|MALAYALAM|MODI|MYANMAR|ORIYA|TAMIL|TELUGU|TIBETAN)\b.*\bLETTER YA\b/;
        $char_name =~ s/^(Y[AEIOU]+)[^AEIOU].*$/$1/i;
        $char_name =~ s/^([AEIOU]+)[^AEIOU]+[AEIOU].*/$1/i;
    }

    my $result = ($orig_char_name =~ /\bCAPITAL\b/) ? (uc $char_name) : (lc $char_name);
    # Store under the name the lookup at the top uses.
    $ht{ROMANIZE_CHARNAME}->{$orig_char_name}->{$lang_code}->{$output_style} = $result;
    return $result;
}
|
1933 |
+
|
1934 |
+
sub assemble_numbers_in_chart {
    # For every span listed in $chart_ht{COMPLEX_NUMERIC_START_END}
    # (start => end), gather the romanizations of the single-position
    # "numeric-value" nodes inside the span -- plus "." and "," separator
    # characters whose node has no romanization -- combine them with
    # assemble_number, and add the result to the chart as a
    # "complex-number" node covering the whole span.
    # Diagnostics go to STDERR only when $verbosePM is set.
    local($this, *chart_ht, $line_number) = @_;

    foreach $start (sort { $a <=> $b } keys %{$chart_ht{COMPLEX_NUMERIC_START_END}}) {
        my $end = $chart_ht{COMPLEX_NUMERIC_START_END}->{$start};
        my @numbers = ();
        foreach $i (($start .. ($end-1))) {
            my $orig_char = $chart_ht{ORIG_CHAR}->{$i};
            # Fix: pass the chart glob. The previous code passed *chart_id,
            # a glob that is not used anywhere else in this sub.
            # NOTE(review): get_node_for_span_with_slot is defined outside
            # this view -- confirm which glob its parameter list expects.
            my $node_id = $this->get_node_for_span_with_slot($i, $i+1, "numeric-value", *chart_ht);
            if (defined($node_id)) {
                my $number = $chart_ht{NODE_ROMAN}->{$node_id};
                if (defined($number)) {
                    push(@numbers, $number);
                } elsif ($orig_char =~ /^[.,]$/) { # decimal point, comma separator
                    push(@numbers, $orig_char);
                } else {
                    print STDERR "Found no romanization for node_id $node_id ($i-" . ($i+1) . ") in assemble_numbers_in_chart\n" if $verbosePM;
                }
            } else {
                print STDERR "Found no node_id for span $i-" . ($i+1) . " in assemble_numbers_in_chart\n" if $verbosePM;
            }
        }
        # The pieces are joined with a middle dot (U+00B7, UTF-8 C2 B7),
        # the separator assemble_number splits on.
        my $complex_number = $this->assemble_number(join("\xC2\xB7", @numbers), $line_number);
        $this->add_node($complex_number, $start, $end, *chart_ht, "", "complex-number");
    }
}
|
1961 |
+
|
1962 |
+
sub assemble_number {
    # Combine a sequence of numeric tokens, separated by middle dots
    # (U+00B7), into as few numbers as possible and return them joined by
    # middle dots again.
    #   e.g. 10 9 100 7 10 8  ->  1978
    #        1 7 5            ->  175
    #        400 30           ->  430
    # $line_number is accepted for diagnostics only.
    local($this, $s, $line_number) = @_;

    my $middot = "\xC2\xB7";
    my @tokens = split(/$middot/, $s); # middle dot U+00B7

    # Pass 1: glue a single digit together with the run of single digits,
    # "." and "," that follows it; commas are dropped from the result.
    for (my $pos = 0; $pos < $#tokens; $pos++) {
        next unless $tokens[$pos] =~ /^\d$/;
        my $last = $pos;
        $last++ while (($last + 1) <= $#tokens) && ($tokens[$last + 1] =~ /^[0-9.,]$/);
        next unless $last > $pos;
        my $merged = join("", @tokens[$pos .. $last]);
        $merged =~ s/,//g;
        splice(@tokens, $pos, $last - $pos + 1, $merged);
    }

    # Pass 2: for each power, smallest first, apply "multiplier power
    # remainder" arithmetic to neighbouring tokens.
    foreach my $power ((10, 100, 1000, 10000, 100000, 1000000, 100000000, 1000000000, 1000000000000)) {
        # Pattern for any multiple d * $power with a single leading digit
        # d in 1..9, e.g. [1-9]00 for $power == 100.
        my $gen_pattern = $power;
        $gen_pattern =~ s/^1/\[1-9\]/;

        for (my $k = 0; $k <= $#tokens; $k++) {
            if ($tokens[$k] == $power) {
                if (($k > 0) && ($tokens[($k-1)] < $power)) {
                    # smaller number in front of the power: multiply
                    splice(@tokens, $k-1, 2, ($tokens[($k-1)] * $tokens[$k]));
                    $k--;
                    if (($k < $#tokens) && ($tokens[($k+1)] < $power)) {
                        # smaller number behind the product: add
                        splice(@tokens, $k, 2, ($tokens[$k] + $tokens[($k+1)]));
                        $k--;
                    }
                }
            }
            # 400 30 (e.g. Egyptian)
            if (($tokens[$k] =~ /^$gen_pattern$/) && ($k < $#tokens) && ($tokens[($k+1)] < $power)) {
                splice(@tokens, $k, 2, ($tokens[$k] + $tokens[($k+1)]));
                $k--;
            }
        }
        last if $#tokens == 0;    # everything has been merged into one number
    }

    my $result = join($middot, @tokens);
    if ($verbosePM) {
        # Log only if the log file is readable.
        my $logfile = "/nfs/isd/ulf/cgi-mt/amr-tmp/uroman-number-log.txt";
        $util->append_to_file($logfile, "$s -> $result\n") if -r $logfile;
    }
    return $result;
}
|
2018 |
+
|
2019 |
+
1;
|
2020 |
+
|
uroman/lib/NLP/UTF8.pm
ADDED
@@ -0,0 +1,1404 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
################################################################
|
2 |
+
# #
|
3 |
+
# UTF8 #
|
4 |
+
# #
|
5 |
+
################################################################
|
6 |
+
|
7 |
+
package NLP::UTF8;
|
8 |
+
|
9 |
+
use NLP::utilities;
|
10 |
+
$util = NLP::utilities;
|
11 |
+
|
12 |
+
%empty_ht = ();
|
13 |
+
|
14 |
+
sub new {
    # Constructor. Works both as a class method (NLP::UTF8->new) and as an
    # instance method ($obj->new); in the latter case the new object is
    # blessed into the class of $obj.
    # output: reference to an empty hash blessed into that class
    local($caller) = @_;

    my $class = ref( $caller ) || $caller;
    return bless({}, $class);
}
|
22 |
+
|
23 |
+
sub unicode_string2string {
    # Replace every escaped code point written as "U+XXXX" or "\uXXXX"
    # (exactly four hex digits) by the UTF-8 encoding of that code point.
    # input:  string that might contain unicode sequences such as "U+0627"
    # output: string in pure utf-8; everything outside the escapes is kept as is
    local($caller,$s) = @_;

    return $s unless defined($s);

    # One left-to-right pass over the string. The previous implementation
    # matched /^(.*)...(.*)$/ without /s and recursed on both halves, which
    # silently dropped a trailing newline after a match and never converted
    # escapes in strings containing an embedded newline.
    # The converted text is not rescanned, so an escape that decodes to
    # "U" or "\" cannot combine with following text into a new escape.
    $s =~ s/(?:U\+|\\u)([0-9A-Fa-f]{4})/$caller->unicode_hex_string2string($1)/ge;
    return $s;
}
|
43 |
+
|
44 |
+
sub unicode_hex_string2string {
    # Convert a code point given as a hex string into its UTF-8 encoding.
    # input:  "0627" (interpreted as hex code)
    # output: utf-8 string for Arabic letter alef; "" if the input is undefined
    local($caller,$unicode) = @_;

    if (defined($unicode)) {
        return $caller->unicode2string(hex($unicode));
    }
    return "";
}
|
52 |
+
|
53 |
+
sub unicode2string {
    # Encode one code point as a UTF-8 byte sequence (1 to 6 bytes, i.e. the
    # original pre-RFC-3629 scheme that goes beyond U+10FFFF).
    # input:  non-neg integer, e.g. 0x627
    # output: utf-8 string for Arabic letter alef; "" for undefined, negative
    #         or too large input
    local($caller,$d) = @_;

    return "" unless defined($d) && $d >= 0;
    return sprintf("%c",$d) if $d <= 0x7F;

    # Each entry describes one sequence length (2, 3, 4, 5, 6 bytes):
    # [largest value that still fits into the lead byte, lead-byte marker bits]
    my @lead_byte_forms = ([0x1F, 0xC0], [0xF, 0xE0], [0x7, 0xF0], [0x3, 0xF8], [0x1, 0xFC]);

    my @continuation_bytes = ();
    foreach my $form (@lead_byte_forms) {
        my ($max_lead_value, $marker) = @$form;
        # Peel off the lowest six bits as the next continuation byte (10xxxxxx).
        unshift(@continuation_bytes, ($d & 0x3F) | 0x80);
        $d >>= 6;
        if ($d <= $max_lead_value) {
            return join("", map { sprintf("%c", $_) } ($d | $marker, @continuation_bytes));
        }
    }
    return ""; # bad input
}
|
81 |
+
|
82 |
+
sub html2utf8 {
    # Convert decimal HTML character references (e.g. "&#233;") to UTF-8.
    # Only references in the ranges 160-255, 1500-1699 and 19968-40879 are
    # converted; all other references are left in place unchanged.
    # input:  string that might contain "&#NNN;" references
    # output: string with the supported references replaced by utf-8
    local($caller, $string) = @_;

    return $string unless $string =~ /\&\#\d{3,5};/;

    my $s = $string;
    # Visit every reference once. The previous loop only ever looked at the
    # last reference in the string (greedy match), so a trailing unsupported
    # reference such as "&#8211;" prevented all earlier ones from being
    # converted, and text after an embedded newline was never reached.
    $s =~ s{\&\#(\d+);}{
        my $code = $1;
        ((($code >= 160) && ($code <= 255))
         || (($code >= 1500) && ($code <= 1699))
         || (($code >= 19968) && ($code <= 40879)))
            ? $caller->unicode2string($code)
            : '&#' . $code . ';';
    }ge;
    return $s;
}
|
102 |
+
|
103 |
+
sub xhtml2utf8 {
    # Convert hexadecimal character references (e.g. "&#xE9;", 2 to 5 hex
    # digits, lower-case "x") to UTF-8.
    # input:  string that might contain "&#xHHHH;" references
    # output: string with those references replaced by utf-8
    local($caller, $string) = @_;

    return $string unless $string =~ /\&\#x[0-9a-fA-F]{2,5};/;

    my $s = $string;
    # Single pass: decoded text is not rescanned. The previous fixpoint loop
    # decoded "&#x26;#x41;" to "&#x41;" and then again to "A", and it stopped
    # working as soon as the string contained an embedded newline.
    $s =~ s/\&\#x([0-9a-fA-F]{2,5});/$caller->unicode_hex_string2string($1)/ge;
    return $s;
}
|
119 |
+
|
120 |
+
sub utf8_marker {
    # output: the three bytes EF BB BF (UTF-8 byte order mark) followed by a newline
    return "\xEF\xBB\xBF\n";
}
|
123 |
+
|
124 |
+
sub enforcer {
    # Coerce an arbitrary byte string into well-formed UTF-8.
    # input:  $s         - string that might not conform to utf-8
    #         $no_repair - optional; if true, every stray byte is replaced by "?"
    # output: string in pure utf-8, with a few "smart replacements" and possibly "?"
    #
    # The string is consumed from the left. Earlier versions matched
    # /^(...)(.*)$/ without /s, so any input that was not pure ASCII and had a
    # newline somewhere before the end failed every pattern and was silently
    # truncated. Consuming with s/^...// does not depend on "." at all.
    local($caller,$s,$no_repair) = @_;

    # Pure ASCII needs no work.
    return $s if $s =~ /^[\x00-\x7F]*$/;

    $no_repair = 0 unless defined($no_repair);
    my $result = "";

    while ($s ne "") {
        # Run of ASCII bytes (newlines included): copy verbatim.
        if ($s =~ s/^([\x00-\x7F]+)//) {
            $result .= $1;
            next;
        }
        # One structurally valid multi-byte sequence (2 to 5 bytes): copy verbatim.
        if ($s =~ s/^([\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF7][\x80-\xBF]{3}|[\xF8-\xFB][\x80-\xBF]{4})//) {
            $result .= $1;
            next;
        }
        # Anything else is a stray byte: repair it or replace it by "?".
        my $c = substr($s, 0, 1);
        $s = substr($s, 1);
        if ($no_repair) {
            $result .= "?";
        } elsif ($c eq "\x85") {           # Windows-1252 ellipsis
            $result .= "...";
        } elsif (($c eq "\x91") || ($c eq "\x92")) { # Windows-1252 single quotes
            $result .= "'";
        } elsif ($c eq "\x93") {           # Windows-1252 left double quote
            $result .= $caller->unicode2string(0x201C);
        } elsif ($c eq "\x94") {           # Windows-1252 right double quote
            $result .= $caller->unicode2string(0x201D);
        } elsif ($c =~ /^[\xC0-\xFF]$/) {
            # Treat as Latin-1 letter U+00C0..U+00FF: UTF-8 is C3 followed by (byte - 0x40).
            $result .= "\xC3" . chr(ord($c) - 0x40);
        } else {
            $result .= "?";
        }
    }
    return $result;
}
|
181 |
+
|
182 |
+
sub split_into_utf8_characters {
    # Split a UTF-8 byte string into a list of tokens. Without options every
    # token is a single UTF-8 character; $group_control switches on grouping.
    # input:  $string        - utf8 string
    #         $group_control - optional option string; recognized substrings:
    #                          "ASCII numbers", "ASCII spaces", "ASCII punct", "ASCII chars",
    #                          "ASCII all" (= the four above), "XML chars", "XML tags",
    #                          "XML chars and tags" / "XML tags and chars",
    #                          "return only chars", "return trailing whitespaces"
    #         *ht            - typeglob of a hash; $ht{HTML_ENTITY_NAME_TO_DECUNICODE} maps
    #                          entity names to decimal code points (used with "XML chars")
    # output: with "return only chars": the token list;
    #         otherwise ($skipped_bytes, $end_of_token_p_string, @characters), where
    #         $skipped_bytes collects bytes that are not part of a valid sequence and
    #         $end_of_token_p_string has one digit per token: "0" if the token is
    #         directly followed by a non-whitespace character, "1" otherwise.
    # Note: the working variables below are package globals, as elsewhere in this file.
    local($caller,$string,$group_control, *ht) = @_;

    @characters = ();
    $end_of_token_p_string = "";
    $skipped_bytes = "";
    $group_control = "" unless defined($group_control);
    $group_ascii_numbers = ($group_control =~ /ASCII numbers/);
    $group_ascii_spaces = ($group_control =~ /ASCII spaces/);
    $group_ascii_punct = ($group_control =~ /ASCII punct/);
    $group_ascii_chars = ($group_control =~ /ASCII chars/);
    $group_xml_chars = ($group_control =~ /XML chars/);
    $group_xml_tags = ($group_control =~ /XML tags/);
    $return_only_chars = ($group_control =~ /return only chars/);
    $return_trailing_whitespaces = ($group_control =~ /return trailing whitespaces/);
    if ($group_control =~ /ASCII all/) {
        $group_ascii_numbers = 1;
        $group_ascii_spaces = 1;
        $group_ascii_chars = 1;
        $group_ascii_punct = 1;
    }
    if ($group_control =~ /(XML chars and tags|XML tags and chars)/) {
        $group_xml_chars = 1;
        $group_xml_tags = 1;
    }
    $orig_string = $string;
    # Sentinel space so that patterns requiring a following delimiter also match at the end.
    $string .= " ";
    while ($string =~ /\S/) {
        # one-character UTF-8 = ASCII
        if ($string =~ /^[\x00-\x7F]/) {
            # decimal character reference, e.g. &#233;
            if ($group_xml_chars
                && (($dec_unicode, $rest) = ($string =~ /^&#(\d+);(.*)$/s))
                && ($utf8_char = $caller->unicode2string($dec_unicode))) {
                push(@characters, $utf8_char);
                $string = $rest;
            # hexadecimal character reference, e.g. &#xE9;
            } elsif ($group_xml_chars
                && (($hex_unicode, $rest) = ($string =~ /^&#x([0-9a-f]{1,6});(.*)$/is))
                && ($utf8_char = $caller->unicode_hex_string2string($hex_unicode))) {
                push(@characters, $utf8_char);
                $string = $rest;
            # named entity, e.g. &eacute; (only if listed in the %ht table)
            } elsif ($group_xml_chars
                && (($html_entity_name, $rest) = ($string =~ /^&([a-z]{1,6});(.*)$/is))
                && ($dec_unicode = $ht{HTML_ENTITY_NAME_TO_DECUNICODE}->{$html_entity_name})
                && ($utf8_char = $caller->unicode2string($dec_unicode))
                ) {
                push(@characters, $utf8_char);
                $string = $rest;
            # complete XML tag, with optional double-quoted attributes
            } elsif ($group_xml_tags
                && (($tag, $rest) = ($string =~ /^(<\/?[a-zA-Z][-_:a-zA-Z0-9]*(\s+[a-zA-Z][-_:a-zA-Z0-9]*=\"[^"]*\")*\s*\/?>)(.*)$/s))) {
                push(@characters, $tag);
                $string = $rest;
            # date of the form YYYY.MM.DD
            # (fix: the second separator used to be an unescaped "." that matched any character)
            } elsif ($group_ascii_numbers && ($string =~ /^[12]\d\d\d\.[01]?\d\.[0-3]?\d([^0-9].*)?$/)) {
                ($date) = ($string =~ /^(\d\d\d\d\.\d?\d\.\d?\d)([^0-9].*)?$/);
                push(@characters,$date);
                $string = substr($string, length($date));
            # number, optionally with thousands separators and a decimal part
            } elsif ($group_ascii_numbers && ($string =~ /^\d/)) {
                ($number) = ($string =~ /^(\d+(,\d\d\d)*(\.\d+)?)/);
                push(@characters,$number);
                $string = substr($string, length($number));
            # whitespace run: consumed, but not returned as a token
            } elsif ($group_ascii_spaces && ($string =~ /^(\s+)/)) {
                ($space) = ($string =~ /^(\s+)/);
                $string = substr($string, length($space));
            # punctuation: run of hyphens, run of periods, or one of : , % ( ) "
            } elsif ($group_ascii_punct && (($punct_seq) = ($string =~ /^(-+|\.+|[:,%()"])/))) {
                push(@characters,$punct_seq);
                $string = substr($string, length($punct_seq));
            # currency-like tokens such as $, $US or US$
            } elsif ($group_ascii_chars && (($word) = ($string =~ /^(\$[A-Z]*|[A-Z]{1,3}\$)/))) {
                push(@characters,$word);
                $string = substr($string, length($word));
            # abbreviations that keep their period
            # (fix: "a.m" / "p.m" used unescaped dots, so e.g. "arm." was taken as an abbreviation)
            } elsif ($group_ascii_chars && (($abbrev) = ($string =~ /^((?:Jan|Feb|Febr|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|Mr|Mrs|Dr|a\.m|p\.m)\.)/))) {
                push(@characters,$abbrev);
                $string = substr($string, length($abbrev));
            # unit word in front of "-long" / "-old": only the unit word is split off
            } elsif ($group_ascii_chars && (($word) = ($string =~ /^(second|minute|hour|day|week|month|year|inch|foot|yard|meter|kilometer|mile)-(?:long|old)/i))) {
                push(@characters,$word);
                $string = substr($string, length($word));
            # number word in front of a hyphen: only the number word is split off
            } elsif ($group_ascii_chars && (($word) = ($string =~ /^(zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion)-/i))) {
                push(@characters,$word);
                $string = substr($string, length($word));
            # plain alphabetic word followed by a delimiter
            } elsif ($group_ascii_chars && (($word) = ($string =~ /^([a-zA-Z]+)(?:[ ,;%?|()"]|'s |' |\. |\d+[:hms][0-9 ])/))) {
                push(@characters,$word);
                $string = substr($string, length($word));
            } elsif ($group_ascii_chars && ($string =~ /^([\x21-\x27\x2A-\x7E]+)/)) { # exclude ()
                ($ascii) = ($string =~ /^([\x21-\x27\x2A-\x7E]+)/); # ASCII black-characters
                push(@characters,$ascii);
                $string = substr($string, length($ascii));
            } elsif ($group_ascii_chars && ($string =~ /^([\x21-\x7E]+)/)) {
                ($ascii) = ($string =~ /^([\x21-\x7E]+)/); # ASCII black-characters
                push(@characters,$ascii);
                $string = substr($string, length($ascii));
            } elsif ($group_ascii_chars && ($string =~ /^([\x00-\x7F]+)/)) {
                ($ascii) = ($string =~ /^([\x00-\x7F]+)/);
                push(@characters,$ascii);
                $string = substr($string, length($ascii));
            # no grouping applies: single ASCII character
            } else {
                push(@characters,substr($string, 0, 1));
                $string = substr($string, 1);
            }

        # two-character UTF-8
        } elsif ($string =~ /^[\xC0-\xDF][\x80-\xBF]/) {
            push(@characters,substr($string, 0, 2));
            $string = substr($string, 2);

        # three-character UTF-8
        } elsif ($string =~ /^[\xE0-\xEF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters,substr($string, 0, 3));
            $string = substr($string, 3);

        # four-character UTF-8
        } elsif ($string =~ /^[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters,substr($string, 0, 4));
            $string = substr($string, 4);

        # five-character UTF-8
        } elsif ($string =~ /^[\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters,substr($string, 0, 5));
            $string = substr($string, 5);

        # six-character UTF-8
        } elsif ($string =~ /^[\xFC-\xFD][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters,substr($string, 0, 6));
            $string = substr($string, 6);

        # not a UTF-8 character
        } else {
            $skipped_bytes .= substr($string, 0, 1);
            $string = substr($string, 1);
        }

        # Record the end-of-token flag only if this iteration actually added a token.
        $end_of_token_p_string .= ($string =~ /^\S/) ? "0" : "1"
            if $#characters >= length($end_of_token_p_string);
    }
    $string =~ s/ $//; # remove previously added space, but keep original spaces
    if ($return_trailing_whitespaces) {
        while ($string =~ /^[ \t]/) {
            push(@characters,substr($string, 0, 1));
            $string = substr($string, 1);
        }
        push(@characters, "\n") if $orig_string =~ /\n$/;
    }
    return ($return_only_chars) ? @characters : ($skipped_bytes, $end_of_token_p_string, @characters);
}
|
325 |
+
|
326 |
+
sub max_substring_info {
    # Find the longest common contiguous substring (counted in UTF-8 characters)
    # of $s1 and $s2 and report on it.
    # input:  $s1, $s2   - utf8 strings
    #         $info_type - "max-ratio1": length of best substring / length of $s1
    #                      "max-ratio2": length of best substring / length of $s2
    #                      "substring":  the best substring itself
    #                      anything else: ";"-separated key=value summary
    # output: see $info_type; 0 if either string contains bytes that could not
    #         be split into UTF-8 characters
    local($caller,$s1,$s2,$info_type) = @_;

    ($skipped_bytes1, $end_of_token_p_string1, @char_list1) = $caller->split_into_utf8_characters($s1, "", *empty_ht);
    ($skipped_bytes2, $end_of_token_p_string2, @char_list2) = $caller->split_into_utf8_characters($s2, "", *empty_ht);
    return 0 if $skipped_bytes1 || $skipped_bytes2;

    ($best_substring_start1, $best_substring_start2, $best_substring_length) = (0, 0, 0);

    # Try every pair of start positions; stop early once the remaining text is
    # too short to beat the best match found so far.
    foreach $start_pos2 ((0 .. $#char_list2)) {
        last if $start_pos2 + $best_substring_length > $#char_list2;
        foreach $start_pos1 ((0 .. $#char_list1)) {
            last if $start_pos1 + $best_substring_length > $#char_list1;
            # Extend the match while both lists have a character left and they agree.
            $matching_length = 0;
            $matching_length++
                while (($start_pos1 + $matching_length <= $#char_list1)
                       && ($start_pos2 + $matching_length <= $#char_list2)
                       && ($char_list1[$start_pos1+$matching_length] eq $char_list2[$start_pos2+$matching_length]));
            next unless $matching_length > $best_substring_length;
            ($best_substring_length, $best_substring_start1, $best_substring_start2)
                = ($matching_length, $start_pos1, $start_pos2);
        }
    }

    if ($info_type =~ /^max-ratio1$/) {
        $length1 = $#char_list1 + 1;
        return 0 unless $length1 > 0;
        return $best_substring_length / $length1;
    }
    if ($info_type =~ /^max-ratio2$/) {
        $length2 = $#char_list2 + 1;
        return 0 unless $length2 > 0;
        return $best_substring_length / $length2;
    }
    if ($info_type =~ /^substring$/) {
        return join("", @char_list1[$best_substring_start1 .. $best_substring_start1+$best_substring_length-1]);
    }

    # Default: human-readable summary of everything that was computed.
    $length1 = $#char_list1 + 1;
    $length2 = $#char_list2 + 1;
    $info = join(";", "s1=$s1", "s2=$s2",
                 "best_substring_length=$best_substring_length",
                 "best_substring_start1=$best_substring_start1",
                 "best_substring_start2=$best_substring_start2",
                 "length1=$length1",
                 "length2=$length2");
    return $info;
}
|
374 |
+
|
375 |
+
sub n_shared_chars_at_start {
    # Count how many leading UTF-8 characters two strings have in common.
    # input:  $s1, $s2 - utf8 strings
    # output: number of characters (not bytes) in the longest common prefix
    local($caller,$s1,$s2) = @_;

    my $n = 0;
    while (($s1 ne "") && ($s2 ne "")) {
        # First character = one byte plus any continuation bytes that follow it.
        # /s is essential: without it the match failed on strings containing a
        # newline, both captures were undef, and undef eq undef was counted as
        # a shared character.
        my ($c1, $rest1) = ($s1 =~ /^(.[\x80-\xBF]*)(.*)$/s);
        my ($c2, $rest2) = ($s2 =~ /^(.[\x80-\xBF]*)(.*)$/s);
        last unless $c1 eq $c2;
        $n++;
        $s1 = $rest1;
        $s2 = $rest2;
    }
    return $n;
}
|
392 |
+
|
393 |
+
sub char_length {
    # Determine, from its lead byte alone, how many bytes the UTF-8 character
    # at the given position occupies.
    # input:  $string      - utf8 string
    #         $byte_offset - optional byte position to look at (default: start)
    # output: 1..6, or 0 if the byte there is not a valid lead byte
    #         (continuation byte, 0xFE/0xFF, or nothing left)
    local($caller,$string,$byte_offset) = @_;

    my $tail = $string;
    $tail = substr($string, $byte_offset) if $byte_offset;

    # [pattern for the lead byte, total length of the sequence it introduces]
    my @lead_byte_classes = (
        [qr/^[\x00-\x7F]/, 1],
        [qr/^[\xC0-\xDF]/, 2],
        [qr/^[\xE0-\xEF]/, 3],
        [qr/^[\xF0-\xF7]/, 4],
        [qr/^[\xF8-\xFB]/, 5],
        [qr/^[\xFC-\xFD]/, 6],
    );
    foreach my $class (@lead_byte_classes) {
        my ($lead_pattern, $n_bytes) = @$class;
        return $n_bytes if $tail =~ $lead_pattern;
    }
    return 0;
}
|
405 |
+
|
406 |
+
sub length_in_utf8_chars {
    # input:  utf8 string
    # output: its length in characters, i.e. the number of bytes that are not
    #         continuation bytes (0x80-0xBF)
    local($caller,$s) = @_;

    # After the continuation bytes are gone, exactly one byte per character is left.
    $s =~ s/[\x80-\xBF]+//g;
    return length($s);
}
|
413 |
+
|
414 |
+
sub byte_length_of_n_chars {
    # Compute how many bytes the next $char_length UTF-8 characters occupy.
    # input:  $char_length        - number of characters to measure
    #         $string             - utf8 string
    #         $byte_offset        - optional byte position to start at (default 0)
    #         $undef_return_value - optional value to return on failure (default -1)
    # output: number of bytes, or $undef_return_value if char_length reports 0
    #         for one of the characters (invalid lead byte or string exhausted)
    local($caller,$char_length,$string,$byte_offset,$undef_return_value) = @_;

    $byte_offset = 0 if !defined($byte_offset);
    $undef_return_value = -1 if !defined($undef_return_value);

    my $n_bytes = 0;
    foreach my $nth_char (1 .. $char_length) {
        my $width = $caller->char_length($string, ($byte_offset+$n_bytes));
        if (!$width) {
            return $undef_return_value;
        }
        $n_bytes += $width;
    }
    return $n_bytes;
}
|
428 |
+
|
429 |
+
sub replace_non_ASCII_bytes {
    # Make a string printable in plain ASCII.
    # input:  $string      - byte string
    #         $replacement - optional (default "HEX"); one of the notations
    #                          "Unicode" -> <U...>       (value from utf8_to_unicode, upper-cased)
    #                          "\u"      -> \uXXXX       (at least 4 upper-case hex digits)
    #                          "U+4"     -> <U+XXXX>
    #                          "HEX"     -> <HEX-E280BA> (raw bytes in hex)
    #                        or any other string, which then literally replaces
    #                        every single byte in the range 0x80-0xFF
    # output: the converted string
    local($caller,$string,$replacement) = @_;

    $replacement = "HEX" if !defined($replacement);

    # Literal replacement: every high byte individually, control characters untouched.
    if ($replacement !~ /^(Unicode|U\+4|\\u|HEX)$/) {
        $new_string = $string;
        $new_string =~ s/[\x80-\xFF]/$replacement/g;
        return $new_string;
    }

    # Decide on the notation once; it cannot change inside the loop.
    my $notation = ($replacement =~ /Unicode/) ? "unicode"
                 : ($replacement =~ /\\u/)     ? "backslash-u"
                 : ($replacement =~ /U\+4/)    ? "u+4"
                 :                               "hex";

    # Repeatedly split off: printable ASCII prefix, then one unit to replace
    # (an ASCII control character, a multi-byte sequence, or a lone high byte).
    # The loop ends at the first position where no such unit can be found;
    # whatever is left is appended unchanged.
    $new_string = "";
    while (($pre,$utf8_char, $post) = ($string =~ /^([\x09\x0A\x20-\x7E]*)([\x00-\x08\x0B-\x1F\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]|[\xF8-\xFF][\x80-\xBF]+|[\x80-\xBF])(.*)$/s)) {
        $new_string .= $pre;
        if ($notation eq "unicode") {
            $new_string .= "<U" . (uc $caller->utf8_to_unicode($utf8_char)) . ">";
        } elsif ($notation eq "backslash-u") {
            $new_string .= "\\u" . (uc sprintf("%04x", $caller->utf8_to_unicode($utf8_char)));
        } elsif ($notation eq "u+4") {
            $new_string .= "<U+" . (uc $caller->utf8_to_4hex_unicode($utf8_char)) . ">";
        } else {
            $new_string .= "<HEX-" . $caller->utf8_to_hex($utf8_char) . ">";
        }
        $string = $post;
    }
    $new_string .= $string;
    return $new_string;
}
|
454 |
+
|
455 |
+
sub valid_utf8_string_p {
    # input:  byte string
    # output: true iff the string consists solely of tab, linefeed, printable
    #         ASCII (0x20-0x7E) and structurally well-formed 2-, 3- and 4-byte
    #         UTF-8 sequences (lead byte plus the right number of continuation bytes)
    local($caller,$string) = @_;

    my $one_char = qr/[\x09\x0A\x20-\x7E]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]/;
    return $string =~ /^(?:$one_char)*$/;
}
|
460 |
+
|
461 |
+
sub valid_utf8_string_incl_ascii_control_p {
    # input:  byte string
    # output: true iff the string consists solely of ASCII bytes (0x00-0x7F,
    #         control characters included) and structurally well-formed 2-, 3-
    #         and 4-byte UTF-8 sequences
    local($caller,$string) = @_;

    my $one_char = qr/[\x00-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]/;
    return $string =~ /^(?:$one_char)*$/;
}
|
466 |
+
|
467 |
+
sub utf8_to_hex {
    # input:  byte string, e.g. the three bytes E2 80 BA
    # output: its bytes as upper-case hex digits, two per byte, e.g. "E280BA"
    local($caller,$s) = @_;

    $hex = join("", map { sprintf("%2.2X", ord($_)) } split(//, $s));
    return $hex;
}
|
476 |
+
|
477 |
+
sub hex_to_utf8 {
    # Turn a textual hex dump into the bytes it denotes.
    # input:  surface string such as \xE2\x80\xBA (the "\x" prefixes are optional)
    # output: the corresponding byte string; conversion stops at the first
    #         position that is not a pair of hex digits
    local($caller,$s) = @_;

    my $bytes = "";
    while (($hex, $rest) = ($s =~ /^(?:\\x)?([0-9A-Fa-f]{2,2})(.*)$/)) {
        $bytes .= chr(hex($hex));
        $s = $rest;
    }
    return $bytes;
}
|
488 |
+
|
489 |
+
sub utf8_to_4hex_unicode {
    # input:  a single UTF-8 encoded character
    # output: its code point (as computed by utf8_to_unicode) in lower-case hex,
    #         zero-padded to at least four digits, e.g. "0627"
    local($caller,$s) = @_;

    my $code_point = $caller->utf8_to_unicode($s);
    return sprintf("%4.4x", $code_point);
}
|
494 |
+
|
495 |
+
sub utf8_to_unicode {
    # Decode a single UTF-8 encoded character into its code point.
    # input:  the bytes of one UTF-8 character (1 to 6 bytes)
    # output: the code point as a non-negative integer (0 for an empty string)
    #
    # No validation is performed: the bytes are folded in from left to right,
    # each contributing as many payload bits as its byte class carries.
    # Bytes 0xFE and 0xFF carry no payload and are ignored.
    local($caller,$s) = @_;

    my $unicode = 0;
    foreach my $i (0 .. length($s)-1) {
        my $byte = ord(substr($s, $i, 1));
        if ($byte <= 0x7F) {
            # ASCII byte: 7 payload bits. Previously these bytes were skipped,
            # so "A" or a control character decoded to 0.
            $unicode = $unicode * 128 + $byte;
        } elsif ($byte <= 0xBF) {         # continuation byte 10xxxxxx
            $unicode = $unicode * 64 + ($byte & 0x3F);
        } elsif ($byte <= 0xDF) {         # lead byte of a 2-byte sequence
            $unicode = $unicode * 32 + ($byte & 0x1F);
        } elsif ($byte <= 0xEF) {         # lead byte of a 3-byte sequence
            $unicode = $unicode * 16 + ($byte & 0x0F);
        } elsif ($byte <= 0xF7) {         # lead byte of a 4-byte sequence
            $unicode = $unicode * 8 + ($byte & 0x07);
        } elsif ($byte <= 0xFB) {         # lead byte of a 5-byte sequence
            $unicode = $unicode * 4 + ($byte & 0x03);
        } elsif ($byte <= 0xFD) {         # lead byte of a 6-byte sequence
            $unicode = $unicode * 2 + ($byte & 0x01);
        }
    }
    return $unicode;
}
|
517 |
+
|
518 |
+
sub charhex {
    # Render a byte string for debugging.
    # input:  byte string
    # output: same string, with every byte outside printable ASCII (space .. "~")
    #         shown as <HEX-XX> (two upper-case hex digits)
    local($caller,$string) = @_;

    my $rendered = "";
    foreach my $pos (0 .. length($string) - 1) {
        $char = substr($string, $pos, 1);
        if ($char =~ /^[ -~]$/) {
            $rendered .= $char;
            next;
        }
        $hex = uc sprintf("%2.2x",ord($char));
        $rendered .= "<HEX-$hex>";
    }
    return $rendered;
}
|
535 |
+
|
536 |
+
sub windows1252_to_utf8 {
    # Convert text in Windows-1252 (or a mix of Windows-1252 and UTF-8) to UTF-8.
    # input:  $s                          - byte string
    #         $norm_to_ascii_p            - optional (default 1): map ellipsis, single
    #                                       quotes, dashes and small tilde to ASCII
    #         $preserve_potential_utf8s_p - optional (default 1): byte sequences that
    #                                       already look like UTF-8 (2 to 5 bytes) are
    #                                       copied unchanged
    # output: utf-8 string; bytes with no Windows-1252 meaning become "?"
    local($caller,$s, $norm_to_ascii_p, $preserve_potential_utf8s_p) = @_;

    return $s if $s =~ /^[\x00-\x7F]*$/; # all ASCII

    $norm_to_ascii_p = 1 unless defined($norm_to_ascii_p);
    $preserve_potential_utf8s_p = 1 unless defined($preserve_potential_utf8s_p);

    # Windows-1252 bytes 0x80-0x9F: [UTF-8 encoding, optional ASCII normalization]
    my %cp1252_high = (
        "\x80" => ["\xE2\x82\xAC"],        # Euro sign
        "\x82" => ["\xE2\x80\x9A"],        # single low quotation mark
        "\x83" => ["\xC6\x92"],            # Latin small letter f with hook
        "\x84" => ["\xE2\x80\x9E"],        # double low quotation mark
        "\x85" => ["\xE2\x80\xA6", "..."], # horizontal ellipsis (three dots)
        "\x86" => ["\xE2\x80\xA0"],        # dagger
        "\x87" => ["\xE2\x80\xA1"],        # double dagger
        "\x88" => ["\xCB\x86"],            # circumflex
        "\x89" => ["\xE2\x80\xB0"],        # per mille sign
        "\x8A" => ["\xC5\xA0"],            # Latin capital letter S with caron
        "\x8B" => ["\xE2\x80\xB9"],        # single left-pointing angle quotation mark
        "\x8C" => ["\xC5\x92"],            # OE ligature
        "\x8E" => ["\xC5\xBD"],            # Latin capital letter Z with caron
        "\x91" => ["\xE2\x80\x98", "`"],   # left single quotation mark
        "\x92" => ["\xE2\x80\x99", "'"],   # right single quotation mark
        "\x93" => ["\xE2\x80\x9C"],        # left double quotation mark
        "\x94" => ["\xE2\x80\x9D"],        # right double quotation mark
        "\x95" => ["\xE2\x80\xA2"],        # bullet
        "\x96" => ["\xE2\x80\x93", "-"],   # n dash
        "\x97" => ["\xE2\x80\x94", "-"],   # m dash
        "\x98" => ["\xCB\x9C", "~"],       # small tilde
        "\x99" => ["\xE2\x84\xA2"],        # trade mark sign
        "\x9A" => ["\xC5\xA1"],            # Latin small letter s with caron
        "\x9B" => ["\xE2\x80\xBA"],        # single right-pointing angle quotation mark
        "\x9C" => ["\xC5\x93"],            # oe ligature
        "\x9E" => ["\xC5\xBE"],            # Latin small letter z with caron
        "\x9F" => ["\xC5\xB8"],            # Latin capital letter Y with diaeresis
    );

    my $converted = "";
    while ($s ne "") {
        $n_bytes = 1;
        my $lead = substr($s, 0, 1);
        if ($s =~ /^[\x00-\x7F]/) {
            $converted .= $lead; # ASCII
        } elsif ($preserve_potential_utf8s_p
                 && ($s =~ /^([\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF7][\x80-\xBF]{3}|[\xF8-\xFB][\x80-\xBF]{4})/)) {
            # already a structurally valid 2- to 5-byte UTF-8 sequence
            $converted .= $1;
            $n_bytes = length($1);
        } elsif ($s =~ /^[\xA0-\xBF]/) {
            # U+00A0..U+00BF: C2 followed by the same byte
            $converted .= "\xC2$lead";
        } elsif ($s =~ /^[\xC0-\xFF]/) {
            # U+00C0..U+00FF: C3 followed by (byte - 0x40)
            $converted .= "\xC3" . chr(ord($lead) - 0x40);
        } elsif (exists($cp1252_high{$lead})) {
            my ($as_utf8, $as_ascii) = @{$cp1252_high{$lead}};
            $converted .= ($norm_to_ascii_p && defined($as_ascii)) ? $as_ascii : $as_utf8;
        } else {
            $converted .= "?";
        }
        $s = substr($s, $n_bytes);
    }
    return $converted;
}
|
629 |
+
|
630 |
+
sub delete_weird_stuff {
    # Remove invisible characters from a UTF-8 string.
    # input:  utf8 string
    # output: the string without the characters listed below
    local($caller, $s) = @_;

    my @invisible = (
        qr/[\x00-\x08\x0B-\x1F\x7F]/,      # ASCII control characters (except tab and linefeed)
        qr/\xC2[\x80-\x9F]/,               # U+0080..U+009F control characters
        qr/\xD9\x80/,                      # U+0640 Arabic tatweel
        qr/\xE2\x80[\x8B-\x8F]/,           # U+200B..U+200F zero-width characters, join and directional marks
        qr/\xEF\xB8[\x80-\x8F]/,           # U+FE00..U+FE0F variation selectors
        qr/\xEF\xBB\xBF/,                  # U+FEFF byte order mark
        qr/\xF3\xA0[\x84-\x87][\x80-\xBF]/, # U+E0100..U+E01FF variation selectors supplement
    );
    my $invisible_char = join("|", @invisible);
    $s =~ s/($invisible_char)//g;
    return $s;
}
|
638 |
+
|
639 |
+
sub number_of_utf8_character {
    # input:  utf8 string
    # output: number of characters, counted as the number of bytes that are
    #         not continuation bytes (0x80-0xBF)
    local($caller, $s) = @_;

    ($s2 = $s) =~ s/[\x80-\xBF]//g;
    return length($s2);
}
|
646 |
+
|
647 |
+
sub cap_letter_reg_exp {
    # output: regular expression source (a byte-level alternation, not enclosed
    #         in a group) matching one capital letter in UTF-8:
    #         includes A-Z and other Latin-based capital letters with accents,
    #         umlauts and other decorations etc.
    # Fixes relative to the earlier table:
    #  - a stray "4" after \x96 in the \xC4 class made the bytes "\xC4" . "4" match;
    #  - added the missing capitals \xC4\x92 (E with macron), \xC5\x94 (R with acute)
    #    and \xC5\xAE (U with ring above).
    return "[A-Z]"
         # Latin-1 Supplement: U+00C0..U+00D6, U+00D8..U+00DE
         . "|\xC3[\x80-\x96\x98-\x9E]"
         # Latin Extended-A, first half (lead byte C4)
         . "|\xC4[\x80\x82\x84\x86\x88\x8A\x8C\x8E\x90\x92\x94\x96\x98\x9A\x9C\x9E\xA0\xA2\xA4\xA6\xA8\xAA\xAC\xAE\xB0\xB2\xB4\xB6\xB9\xBB\xBD\xBF]"
         # Latin Extended-A, second half (lead byte C5)
         . "|\xC5[\x81\x83\x85\x87\x8A\x8C\x8E\x90\x92\x94\x96\x98\x9A\x9C\x9E\xA0\xA2\xA4\xA6\xA8\xAA\xAC\xAE\xB0\xB2\xB4\xB6\xB8\xB9\xBB\xBD]";
}
|
651 |
+
|
652 |
+
sub regex_extended_case_expansion {
    # Make the accented lower-case letters in a regular-expression source
    # string match their upper-case counterparts as well.
    # input:  regex source string in UTF-8
    # output: same string, where each covered lower-case letter (2 bytes) is
    #         replaced by lead byte + "[" upper second byte, lower second byte "]",
    #         e.g. \xC3\xA9 becomes \xC3[\x89\xA9]
    # Covered: U+00E0..U+00F6, U+00F8..U+00FE (lead byte C3) and
    #          U+0151, U+0161, U+0171 (lead byte C5).
    local($caller, $s) = @_;

    # Lead byte C3: the upper-case letter's second byte is 0x20 below the lower-case one.
    $s =~ s/\xC3([\xA0-\xB6\xB8-\xBE])/"\xC3[" . chr(ord($1) - 0x20) . $1 . "]"/ge;

    # Lead byte C5: upper and lower case alternate, so the upper-case byte is one below.
    $s =~ s/\xC5([\x91\xA1\xB1])/"\xC5[" . chr(ord($1) - 1) . $1 . "]"/ge;

    return $s;
}
|
695 |
+
|
696 |
+
sub extended_lower_case {
|
697 |
+
local($caller, $s) = @_;
|
698 |
+
|
699 |
+
$s =~ tr/A-Z/a-z/;
|
700 |
+
|
701 |
+
# Latin-1
|
702 |
+
if ($s =~ /\xC3[\x80-\x9F]/) {
|
703 |
+
$s =~ s/À/à/g;
|
704 |
+
$s =~ s/Á/á/g;
|
705 |
+
$s =~ s/Â/â/g;
|
706 |
+
$s =~ s/Ã/ã/g;
|
707 |
+
$s =~ s/Ä/ä/g;
|
708 |
+
$s =~ s/Å/å/g;
|
709 |
+
$s =~ s/Æ/æ/g;
|
710 |
+
$s =~ s/Ç/ç/g;
|
711 |
+
$s =~ s/È/è/g;
|
712 |
+
$s =~ s/É/é/g;
|
713 |
+
$s =~ s/Ê/ê/g;
|
714 |
+
$s =~ s/Ë/ë/g;
|
715 |
+
$s =~ s/Ì/ì/g;
|
716 |
+
$s =~ s/Í/í/g;
|
717 |
+
$s =~ s/Î/î/g;
|
718 |
+
$s =~ s/Ï/ï/g;
|
719 |
+
$s =~ s/Ð/ð/g;
|
720 |
+
$s =~ s/Ñ/ñ/g;
|
721 |
+
$s =~ s/Ò/ò/g;
|
722 |
+
$s =~ s/Ó/ó/g;
|
723 |
+
$s =~ s/Ô/ô/g;
|
724 |
+
$s =~ s/Õ/õ/g;
|
725 |
+
$s =~ s/Ö/ö/g;
|
726 |
+
$s =~ s/Ø/ø/g;
|
727 |
+
$s =~ s/Ù/ù/g;
|
728 |
+
$s =~ s/Ú/ú/g;
|
729 |
+
$s =~ s/Û/û/g;
|
730 |
+
$s =~ s/Ü/ü/g;
|
731 |
+
$s =~ s/Ý/ý/g;
|
732 |
+
$s =~ s/Þ/þ/g;
|
733 |
+
}
|
734 |
+
# Latin Extended-A
|
735 |
+
if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) {
|
736 |
+
$s =~ s/Ā/ā/g;
|
737 |
+
$s =~ s/Ă/ă/g;
|
738 |
+
$s =~ s/Ą/ą/g;
|
739 |
+
$s =~ s/Ć/ć/g;
|
740 |
+
$s =~ s/Ĉ/ĉ/g;
|
741 |
+
$s =~ s/Ċ/ċ/g;
|
742 |
+
$s =~ s/Č/č/g;
|
743 |
+
$s =~ s/Ď/ď/g;
|
744 |
+
$s =~ s/Đ/đ/g;
|
745 |
+
$s =~ s/Ē/ē/g;
|
746 |
+
$s =~ s/Ĕ/ĕ/g;
|
747 |
+
$s =~ s/Ė/ė/g;
|
748 |
+
$s =~ s/Ę/ę/g;
|
749 |
+
$s =~ s/Ě/ě/g;
|
750 |
+
$s =~ s/Ĝ/ĝ/g;
|
751 |
+
$s =~ s/Ğ/ğ/g;
|
752 |
+
$s =~ s/Ġ/ġ/g;
|
753 |
+
$s =~ s/Ģ/ģ/g;
|
754 |
+
$s =~ s/Ĥ/ĥ/g;
|
755 |
+
$s =~ s/Ħ/ħ/g;
|
756 |
+
$s =~ s/Ĩ/ĩ/g;
|
757 |
+
$s =~ s/Ī/ī/g;
|
758 |
+
$s =~ s/Ĭ/ĭ/g;
|
759 |
+
$s =~ s/Į/į/g;
|
760 |
+
$s =~ s/İ/ı/g;
|
761 |
+
$s =~ s/IJ/ij/g;
|
762 |
+
$s =~ s/Ĵ/ĵ/g;
|
763 |
+
$s =~ s/Ķ/ķ/g;
|
764 |
+
$s =~ s/Ĺ/ĺ/g;
|
765 |
+
$s =~ s/Ļ/ļ/g;
|
766 |
+
$s =~ s/Ľ/ľ/g;
|
767 |
+
$s =~ s/Ŀ/ŀ/g;
|
768 |
+
$s =~ s/Ł/ł/g;
|
769 |
+
$s =~ s/Ń/ń/g;
|
770 |
+
$s =~ s/Ņ/ņ/g;
|
771 |
+
$s =~ s/Ň/ň/g;
|
772 |
+
$s =~ s/Ŋ/ŋ/g;
|
773 |
+
$s =~ s/Ō/ō/g;
|
774 |
+
$s =~ s/Ŏ/ŏ/g;
|
775 |
+
$s =~ s/Ő/ő/g;
|
776 |
+
$s =~ s/Œ/œ/g;
|
777 |
+
$s =~ s/Ŕ/ŕ/g;
|
778 |
+
$s =~ s/Ŗ/ŗ/g;
|
779 |
+
$s =~ s/Ř/ř/g;
|
780 |
+
$s =~ s/Ś/ś/g;
|
781 |
+
$s =~ s/Ŝ/ŝ/g;
|
782 |
+
$s =~ s/Ş/ş/g;
|
783 |
+
$s =~ s/Š/š/g;
|
784 |
+
$s =~ s/Ţ/ţ/g;
|
785 |
+
$s =~ s/Ť/ť/g;
|
786 |
+
$s =~ s/Ŧ/ŧ/g;
|
787 |
+
$s =~ s/Ũ/ũ/g;
|
788 |
+
$s =~ s/Ū/ū/g;
|
789 |
+
$s =~ s/Ŭ/ŭ/g;
|
790 |
+
$s =~ s/Ů/ů/g;
|
791 |
+
$s =~ s/Ű/ű/g;
|
792 |
+
$s =~ s/Ų/ų/g;
|
793 |
+
$s =~ s/Ŵ/ŵ/g;
|
794 |
+
$s =~ s/Ŷ/ŷ/g;
|
795 |
+
$s =~ s/Ź/ź/g;
|
796 |
+
$s =~ s/Ż/ż/g;
|
797 |
+
$s =~ s/Ž/ž/g;
|
798 |
+
}
|
799 |
+
# Greek letters
|
800 |
+
if ($s =~ /\xCE[\x86-\xAB]/) {
|
801 |
+
$s =~ s/Α/α/g;
|
802 |
+
$s =~ s/Β/β/g;
|
803 |
+
$s =~ s/Γ/γ/g;
|
804 |
+
$s =~ s/Δ/δ/g;
|
805 |
+
$s =~ s/Ε/ε/g;
|
806 |
+
$s =~ s/Ζ/ζ/g;
|
807 |
+
$s =~ s/Η/η/g;
|
808 |
+
$s =~ s/Θ/θ/g;
|
809 |
+
$s =~ s/Ι/ι/g;
|
810 |
+
$s =~ s/Κ/κ/g;
|
811 |
+
$s =~ s/Λ/λ/g;
|
812 |
+
$s =~ s/Μ/μ/g;
|
813 |
+
$s =~ s/Ν/ν/g;
|
814 |
+
$s =~ s/Ξ/ξ/g;
|
815 |
+
$s =~ s/Ο/ο/g;
|
816 |
+
$s =~ s/Π/π/g;
|
817 |
+
$s =~ s/Ρ/ρ/g;
|
818 |
+
$s =~ s/Σ/σ/g;
|
819 |
+
$s =~ s/Τ/τ/g;
|
820 |
+
$s =~ s/Υ/υ/g;
|
821 |
+
$s =~ s/Φ/φ/g;
|
822 |
+
$s =~ s/Χ/χ/g;
|
823 |
+
$s =~ s/Ψ/ψ/g;
|
824 |
+
$s =~ s/Ω/ω/g;
|
825 |
+
$s =~ s/Ϊ/ϊ/g;
|
826 |
+
$s =~ s/Ϋ/ϋ/g;
|
827 |
+
$s =~ s/Ά/ά/g;
|
828 |
+
$s =~ s/Έ/έ/g;
|
829 |
+
$s =~ s/Ή/ή/g;
|
830 |
+
$s =~ s/Ί/ί/g;
|
831 |
+
$s =~ s/Ό/ό/g;
|
832 |
+
$s =~ s/Ύ/ύ/g;
|
833 |
+
$s =~ s/Ώ/ώ/g;
|
834 |
+
}
|
835 |
+
# Cyrillic letters
|
836 |
+
if ($s =~ /\xD0[\x80-\xAF]/) {
|
837 |
+
$s =~ s/А/а/g;
|
838 |
+
$s =~ s/Б/б/g;
|
839 |
+
$s =~ s/В/в/g;
|
840 |
+
$s =~ s/Г/г/g;
|
841 |
+
$s =~ s/Д/д/g;
|
842 |
+
$s =~ s/Е/е/g;
|
843 |
+
$s =~ s/Ж/ж/g;
|
844 |
+
$s =~ s/З/з/g;
|
845 |
+
$s =~ s/И/и/g;
|
846 |
+
$s =~ s/Й/й/g;
|
847 |
+
$s =~ s/К/к/g;
|
848 |
+
$s =~ s/Л/л/g;
|
849 |
+
$s =~ s/М/м/g;
|
850 |
+
$s =~ s/Н/н/g;
|
851 |
+
$s =~ s/О/о/g;
|
852 |
+
$s =~ s/П/п/g;
|
853 |
+
$s =~ s/Р/р/g;
|
854 |
+
$s =~ s/С/с/g;
|
855 |
+
$s =~ s/Т/т/g;
|
856 |
+
$s =~ s/У/у/g;
|
857 |
+
$s =~ s/Ф/ф/g;
|
858 |
+
$s =~ s/Х/х/g;
|
859 |
+
$s =~ s/Ц/ц/g;
|
860 |
+
$s =~ s/Ч/ч/g;
|
861 |
+
$s =~ s/Ш/ш/g;
|
862 |
+
$s =~ s/Щ/щ/g;
|
863 |
+
$s =~ s/Ъ/ъ/g;
|
864 |
+
$s =~ s/Ы/ы/g;
|
865 |
+
$s =~ s/Ь/ь/g;
|
866 |
+
$s =~ s/Э/э/g;
|
867 |
+
$s =~ s/Ю/ю/g;
|
868 |
+
$s =~ s/Я/я/g;
|
869 |
+
$s =~ s/Ѐ/ѐ/g;
|
870 |
+
$s =~ s/Ё/ё/g;
|
871 |
+
$s =~ s/Ђ/ђ/g;
|
872 |
+
$s =~ s/Ѓ/ѓ/g;
|
873 |
+
$s =~ s/Є/є/g;
|
874 |
+
$s =~ s/Ѕ/ѕ/g;
|
875 |
+
$s =~ s/І/і/g;
|
876 |
+
$s =~ s/Ї/ї/g;
|
877 |
+
$s =~ s/Ј/ј/g;
|
878 |
+
$s =~ s/Љ/љ/g;
|
879 |
+
$s =~ s/Њ/њ/g;
|
880 |
+
$s =~ s/Ћ/ћ/g;
|
881 |
+
$s =~ s/Ќ/ќ/g;
|
882 |
+
$s =~ s/Ѝ/ѝ/g;
|
883 |
+
$s =~ s/Ў/ў/g;
|
884 |
+
$s =~ s/Џ/џ/g;
|
885 |
+
}
|
886 |
+
# Fullwidth A-Z
|
887 |
+
if ($s =~ /\xEF\xBC[\xA1-\xBA]/) {
|
888 |
+
$s =~ s/A/a/g;
|
889 |
+
$s =~ s/B/b/g;
|
890 |
+
$s =~ s/C/c/g;
|
891 |
+
$s =~ s/D/d/g;
|
892 |
+
$s =~ s/E/e/g;
|
893 |
+
$s =~ s/F/f/g;
|
894 |
+
$s =~ s/G/g/g;
|
895 |
+
$s =~ s/H/h/g;
|
896 |
+
$s =~ s/I/i/g;
|
897 |
+
$s =~ s/J/j/g;
|
898 |
+
$s =~ s/K/k/g;
|
899 |
+
$s =~ s/L/l/g;
|
900 |
+
$s =~ s/M/m/g;
|
901 |
+
$s =~ s/N/n/g;
|
902 |
+
$s =~ s/O/o/g;
|
903 |
+
$s =~ s/P/p/g;
|
904 |
+
$s =~ s/Q/q/g;
|
905 |
+
$s =~ s/R/r/g;
|
906 |
+
$s =~ s/S/s/g;
|
907 |
+
$s =~ s/T/t/g;
|
908 |
+
$s =~ s/U/u/g;
|
909 |
+
$s =~ s/V/v/g;
|
910 |
+
$s =~ s/W/w/g;
|
911 |
+
$s =~ s/X/x/g;
|
912 |
+
$s =~ s/Y/y/g;
|
913 |
+
$s =~ s/Z/z/g;
|
914 |
+
}
|
915 |
+
|
916 |
+
return $s;
|
917 |
+
}
|
918 |
+
|
919 |
+
# Upper-case a UTF-8 encoded byte string.
#
# Handles ASCII a-z, the Latin-1 Supplement lower-case letters encoded as
# C3 A0..C3 B6 and C3 B8..C3 BE, and three Latin Extended-A letters
# (C5 91, C5 A1, C5 B1). Every other byte sequence is returned unchanged.
#
#   $caller  invocant (class name or object); not used here
#   $text    string to convert
# Returns the converted copy of the string.
sub extended_upper_case {
    my ($caller, $text) = @_;

    # ASCII letters first.
    $text =~ tr/a-z/A-Z/;

    # Done unless some two-byte sequence with lead byte C3..C5 is present.
    return $text if $text !~ /[\xC3-\xC5][\x80-\xBF]/;

    # Latin-1 Supplement: the upper-case form has a trail byte 0x20 lower.
    # C3 B7 (division sign) and C3 BF are deliberately not in the class.
    $text =~ s/\xC3([\xA0-\xB6\xB8-\xBE])/"\xC3" . chr(ord($1) - 0x20)/ge;

    # Latin Extended-A: only these three trail bytes are mapped, each to
    # the byte immediately below it.
    $text =~ s/\xC5([\x91\xA1\xB1])/"\xC5" . chr(ord($1) - 1)/ge;

    return $text;
}
|
963 |
+
|
964 |
+
# Upper-case the first character of a UTF-8 encoded byte string.
#
# The first character is recognised when it is a 1-, 2- or 3-byte UTF-8
# sequence; it is passed to $caller->extended_upper_case and the rest of
# the string is appended unchanged. If the string does not start with such
# a sequence (empty string, 4-byte lead byte, stray continuation byte) it
# is returned as is.
#
#   $caller  invocant providing extended_upper_case
#   $s       string to convert
# Returns the string with its first character upper-cased.
sub extended_first_upper_case {
    local($caller, $s) = @_;

    # /s lets the remainder span newlines and \z anchors at the true end of
    # the string, so multi-line input is handled and a trailing newline is
    # preserved instead of being silently dropped.
    if ($s =~ /^([\x00-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF])(.*)\z/s) {
        # Copy the captures into lexicals before calling out, so no package
        # global is written and the callee cannot disturb them.
        my ($first_char, $rest) = ($1, $2);
        return $caller->extended_upper_case($first_char) . $rest;
    }
    return $s;
}
|
973 |
+
|
974 |
+
# Undo a doubled encoding of two-byte UTF-8 sequences.
#
# Rewrites each four-byte sequence C3 8x C2 yy (x in 2..5, yy in 80..BF)
# as the two-byte sequence Cx yy.
#
#   $caller  invocant; not used here
#   $text    string to repair
# Returns the repaired copy of the string.
sub repair_doubly_converted_utf8_strings {
    my ($caller, $text) = @_;

    # Cheap pre-check: leave strings without any candidate untouched.
    return $text unless $text =~ /\xC3[\x82-\x85]\xC2[\x80-\xBF]/;

    # One pass per lead byte, in ascending order. The passes stay separate
    # because a later pass may match text produced by an earlier one.
    foreach my $code (0x82 .. 0x85) {
        my $mangled = chr($code);          # second byte of the mangled lead
        my $lead    = chr($code + 0x40);   # restored lead byte C2..C5
        $text =~ s/\xC3\Q$mangled\E\xC2([\x80-\xBF])/$lead$1/g;
    }
    return $text;
}
|
985 |
+
|
986 |
+
# Repair text that was run through a Latin1-to-UTF8 converter although it
# was already UTF-8 or was really Windows-1252.
#
# Three repairs are applied in this order:
#   1. C3 A2 C2 80 C2 yy  ->  E2 80 yy
#   2. C2 followed by a byte in 80..9F: the byte is handed to
#      $caller->windows1252_to_utf8; the sequence is kept unchanged when
#      that method answers "?".
#   3. A fixed table of byte sequences, one per Windows-1252 code in
#      80..9F that has an entry, plus the replacement character.
#
#   $caller  invocant providing windows1252_to_utf8
#   $s       string to repair
# Returns the repaired copy of the string.
#
# Repairs 1 and 2 are single-pass substitutions rather than loops that peel
# one match off the front and re-copy the remainder, and all temporaries
# are lexical, so no package globals are overwritten.
sub repair_misconverted_windows_to_utf8_strings {
    local($caller, $s) = @_;

    # correcting conversions of UTF8 using Latin1-to-UTF converter
    $s =~ s/\xC3\xA2\xC2\x80\xC2([\x90-\xEF])/\xE2\x80$1/g;

    # correcting conversions of Windows1252-to-UTF8 using Latin1-to-UTF converter
    $s =~ s{\xC2([\x80-\x9F])}{
        my $c_windows = $1;   # copy: the callee may run its own regexes
        my $c_utf8 = $caller->windows1252_to_utf8($c_windows, 0);
        ($c_utf8 eq "?") ? ("\xC2" . $c_windows) : $c_utf8;
    }ge;

    if ($s =~ /\xC3/) {
        $s =~ s/\xC3\xA2\xE2\x80\x9A\xC2\xAC/\xE2\x82\xAC/g; # x80 -> Euro sign
        # x81 codepoint undefined in Windows 1252
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\xA1/\xE2\x80\x9A/g; # x82 -> single low-9 quotation mark
        $s =~ s/\xC3\x86\xE2\x80\x99/\xC6\x92/g; # x83 -> Latin small letter f with hook
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\xBE/\xE2\x80\x9E/g; # x84 -> double low-9 quotation mark
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA6/\xE2\x80\xA6/g; # x85 -> horizontal ellipsis
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA0/\xE2\x80\xA0/g; # x86 -> dagger
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA1/\xE2\x80\xA1/g; # x87 -> double dagger
        $s =~ s/\xC3\x8B\xE2\x80\xA0/\xCB\x86/g; # x88 -> modifier letter circumflex accent
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xB0/\xE2\x80\xB0/g; # x89 -> per mille sign
        $s =~ s/\xC3\x85\xC2\xA0/\xC5\xA0/g; # x8A -> Latin capital letter S with caron
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xB9/\xE2\x80\xB9/g; # x8B -> single left-pointing angle quotation mark
        $s =~ s/\xC3\x85\xE2\x80\x99/\xC5\x92/g; # x8C -> Latin capital ligature OE
        # x8D codepoint undefined in Windows 1252
        $s =~ s/\xC3\x85\xC2\xBD/\xC5\xBD/g; # x8E -> Latin capital letter Z with caron
        # x8F codepoint undefined in Windows 1252
        # x90 codepoint undefined in Windows 1252
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xCB\x9C/\xE2\x80\x98/g; # x91 a-circumflex+euro+small tilde -> left single quotation mark
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2/\xE2\x80\x99/g; # x92 a-circumflex+euro+trademark -> right single quotation mark
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\x93/\xE2\x80\x9C/g; # x93 a-circumflex+euro+Latin small ligature oe -> left double quotation mark
        # x94 maps through undefined intermediate code point
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA2/\xE2\x80\xA2/g; # x95 a-circumflex+euro+cent sign -> bullet
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C/\xE2\x80\x93/g; # x96 a-circumflex+euro+left double quotation mark -> en dash
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D/\xE2\x80\x94/g; # x97 a-circumflex+euro+right double quotation mark -> em dash
        $s =~ s/\xC3\x8B\xC5\x93/\xCB\x9C/g; # x98 Latin capital e diaeresis+Latin small ligature oe -> small tilde
        $s =~ s/\xC3\xA2\xE2\x80\x9E\xC2\xA2/\xE2\x84\xA2/g; # x99 -> trade mark sign
        $s =~ s/\xC3\x85\xC2\xA1/\xC5\xA1/g; # x9A -> Latin small letter s with caron
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xBA/\xE2\x80\xBA/g; # x9B -> single right-pointing angle quotation mark
        $s =~ s/\xC3\x85\xE2\x80\x9C/\xC5\x93/g; # x9C -> Latin small ligature oe
        # x9D codepoint undefined in Windows 1252
        $s =~ s/\xC3\x85\xC2\xBE/\xC5\xBE/g; # x9E -> Latin small letter z with caron
        $s =~ s/\xC3\x85\xC2\xB8/\xC5\xB8/g; # x9F -> Latin capital letter Y with diaeresis
        $s =~ s/\xC3\xAF\xC2\xBF\xC2\xBD/\xEF\xBF\xBD/g; # replacement character
    }

    return $s;
}
|
1048 |
+
|
1049 |
+
# Convert a Latin-1 byte string to UTF-8.
#
# Bytes 00..7F are copied unchanged. A byte in 80..BF becomes C2 followed
# by that byte; a byte in C0..FF becomes C3 followed by the byte minus
# 0x40.
#
#   $caller  invocant; not used here
#   $s       Latin-1 string (an undefined value yields "")
# Returns the converted string.
#
# The conversion is a single substitution pass. All temporaries are
# lexical, so the package globals $pre, $c and $post are no longer
# overwritten on every call.
sub latin1_to_utf {
    my ($caller, $s) = @_;

    return "" unless defined $s;

    $s =~ s{([\x80-\xFF])}{
        my $byte = ord($1);
        $byte < 0xC0 ? "\xC2" . chr($byte) : "\xC3" . chr($byte - 0x40);
    }ge;

    return $s;
}
|
1066 |
+
|
1067 |
+
# Tell whether a character-type label denotes a letter-like character.
#
# A label qualifies when it contains, as whole words, one of "diacritic",
# "letter", "syllable", or "CJK"/"hiragana"/"kana"/"katakana" followed by
# whitespace and "character".
#
#   $caller     invocant; not used here
#   $char_type  type label to test
# Returns the result of the pattern match: true/false in scalar context,
# the two capture groups in list context.
sub character_type_is_letter_type {
    my ($caller, $char_type) = @_;

    # Both groups capture on purpose; list-context callers receive them.
    return $char_type =~ m{
        \b
        (
            (CJK|hiragana|kana|katakana) \s+ character
          | diacritic
          | letter
          | syllable
        )
        \b
    }x;
}
|
1072 |
+
|
1073 |
+
# Classify a string holding one UTF-8 encoded character (or an XML tag)
# and return a descriptive type label such as "ASCII digit", "Greek letter"
# or "non-UTF8 (invalid)".
#
#   $caller  invocant; not used here
#   $c       byte string to classify
# Returns the label of the first rule that matches, or "other character"
# when the sequence is well formed but no rule applies.
#
# The rules are ORDER DEPENDENT: within each lead-byte branch narrow ranges
# (digits, punctuation, currency) come before the broad script ranges that
# contain them, so lines must not be reordered.
#
# Changes relative to the previous version:
#   * every lead-byte branch and every well-formedness check is anchored at
#     both ends, as the 2- and 3-byte branches already were, so a string
#     starting with a stray continuation byte is reported as invalid
#   * the FE check had a stray "]" in its pattern and therefore rejected
#     every FE sequence
#   * the fullwidth punctuation range EF BD 9B..A4 swallowed EF BD A1..A4,
#     making the "halfwidth Japanese punctuation" rule unreachable
#   * the label "Balinese puncutation" was misspelled
sub character_type {
    my ($caller, $c) = @_;

    if ($c =~ /^[\x00-\x7F]/) {
        return "XML tag" if $c =~ /^<.*>$/;
        return "ASCII Latin letter" if $c =~ /^[a-z]$/i;
        return "ASCII digit" if $c =~ /^[0-9]$/i;
        return "ASCII whitespace" if $c =~ /^[\x09-\x0D\x20]$/;
        return "ASCII control-character" if $c =~ /^[\x00-\x1F\x7F]$/;
        return "ASCII currency" if $c eq "\$";
        return "ASCII punctuation";
    } elsif ($c =~ /^[\xC0-\xDF]/) {
        return "non-UTF8 (invalid)" unless $c =~ /^[\xC0-\xDF][\x80-\xBF]$/;
        return "non-shortest-UTF8 (invalid)" if $c =~ /[\xC0-\xC1]/;
        return "non-ASCII control-character" if $c =~ /\xC2[\x80-\x9F]/;
        return "non-ASCII whitespace" if $c =~ /\xC2\xA0/;
        return "non-ASCII currency" if $c =~ /\xC2[\xA2-\xA5]/;
        return "fraction" if $c =~ /\xC2[\xBC-\xBE]/; # NEW
        return "superscript digit" if $c =~ /\xC2[\xB2\xB3\xB9]/;
        return "non-ASCII Latin letter" if $c =~ /\xC2\xB5/; # micro sign
        return "non-ASCII punctuation" if $c =~ /\xC2[\xA0-\xBF]/;
        return "non-ASCII punctuation" if $c =~ /\xC3[\x97\xB7]/;
        return "non-ASCII Latin letter" if $c =~ /\xC3[\x80-\xBF]/;
        return "Latin ligature letter" if $c =~ /\xC4[\xB2\xB3]/;
        return "Latin ligature letter" if $c =~ /\xC5[\x92\x93]/;
        return "non-ASCII Latin letter" if $c =~ /[\xC4-\xC8]/;
        return "non-ASCII Latin letter" if $c =~ /\xC9[\x80-\x8F]/;
        return "IPA" if $c =~ /\xC9[\x90-\xBF]/;
        return "IPA" if $c =~ /\xCA[\x80-\xBF]/;
        return "IPA" if $c =~ /\xCB[\x80-\xBF]/;
        return "combining-diacritic" if $c =~ /\xCC[\x80-\xBF]/;
        return "combining-diacritic" if $c =~ /\xCD[\x80-\xAF]/;
        return "Greek punctuation" if $c =~ /\xCD[\xBE]/; # Greek question mark
        return "Greek punctuation" if $c =~ /\xCE[\x87]/; # Greek semicolon
        return "Greek letter" if $c =~ /\xCD[\xB0-\xBF]/;
        return "Greek letter" if $c =~ /\xCE/;
        return "Greek letter" if $c =~ /\xCF[\x80-\xA1\xB3\xB7\xB8\xBA\xBB]/;
        return "Coptic letter" if $c =~ /\xCF[\xA2-\xAF]/;
        return "Cyrillic letter" if $c =~ /[\xD0-\xD3]/;
        return "Cyrillic letter" if $c =~ /\xD4[\x80-\xAF]/;
        return "Armenian punctuation" if $c =~ /\xD5[\x9A-\x9F]/;
        return "Armenian punctuation" if $c =~ /\xD6[\x89-\x8F]/;
        return "Armenian letter" if $c =~ /\xD4[\xB0-\xBF]/;
        return "Armenian letter" if $c =~ /\xD5/;
        return "Armenian letter" if $c =~ /\xD6[\x80-\x8F]/;
        return "Hebrew accent" if $c =~ /\xD6[\x91-\xAE]/;
        return "Hebrew punctuation" if $c =~ /\xD6\xBE/;
        return "Hebrew punctuation" if $c =~ /\xD7[\x80\x83\x86\xB3\xB4]/;
        return "Hebrew point" if $c =~ /\xD6[\xB0-\xBF]/;
        return "Hebrew point" if $c =~ /\xD7[\x81\x82\x87]/;
        return "Hebrew letter" if $c =~ /\xD7[\x90-\xB2]/;
        return "other Hebrew" if $c =~ /\xD6[\x90-\xBF]/;
        return "other Hebrew" if $c =~ /\xD7/;
        return "Arabic currency" if $c =~ /\xD8\x8B/; # Afghani sign
        return "Arabic punctuation" if $c =~ /\xD8[\x89-\x8D\x9B\x9E\x9F]/;
        return "Arabic punctuation" if $c =~ /\xD9[\xAA-\xAD]/;
        return "Arabic punctuation" if $c =~ /\xDB[\x94]/;
        return "Arabic tatweel" if $c =~ /\xD9\x80/;
        return "Arabic letter" if $c =~ /\xD8[\xA0-\xBF]/;
        return "Arabic letter" if $c =~ /\xD9[\x81-\x9F]/;
        return "Arabic letter" if $c =~ /\xD9[\xAE-\xBF]/;
        return "Arabic letter" if $c =~ /\xDA[\x80-\xBF]/;
        return "Arabic letter" if $c =~ /\xDB[\x80-\x95]/;
        return "Arabic Indic digit" if $c =~ /\xD9[\xA0-\xA9]/;
        return "Arabic Indic digit" if $c =~ /\xDB[\xB0-\xB9]/;
        return "other Arabic" if $c =~ /[\xD8-\xDB]/;
        return "Syriac punctuation" if $c =~ /\xDC[\x80-\x8F]/;
        return "Syriac letter" if $c =~ /\xDC[\x90-\xAF]/;
        return "Syriac diacritic" if $c =~ /\xDC[\xB0-\xBF]/;
        return "Syriac diacritic" if $c =~ /\xDD[\x80-\x8A]/;
        return "Thaana letter" if $c =~ /\xDE/;
    } elsif ($c =~ /^[\xE0-\xEF]/) {
        return "non-UTF8 (invalid)" unless $c =~ /^[\xE0-\xEF][\x80-\xBF]{2}$/;
        return "non-shortest-UTF8 (invalid)" if $c =~ /\xE0[\x80-\x9F]/;
        return "Arabic letter" if $c =~ /\xE0\xA2[\xA0-\xBF]/; # extended letters
        return "other Arabic" if $c =~ /\xE0\xA3/; # extended characters
        return "Devanagari punctuation" if $c =~ /\xE0\xA5[\xA4\xA5]/; # danda, double danda
        return "Devanagari digit" if $c =~ /\xE0\xA5[\xA6-\xAF]/;
        return "Devanagari letter" if $c =~ /\xE0[\xA4-\xA5]/;
        return "Bengali digit" if $c =~ /\xE0\xA7[\xA6-\xAF]/;
        return "Bengali currency" if $c =~ /\xE0\xA7[\xB2-\xB9]/;
        return "Bengali letter" if $c =~ /\xE0[\xA6-\xA7]/;
        return "Gurmukhi digit" if $c =~ /\xE0\xA9[\xA6-\xAF]/;
        return "Gurmukhi letter" if $c =~ /\xE0[\xA8-\xA9]/;
        return "Gujarati digit" if $c =~ /\xE0\xAB[\xA6-\xAF]/;
        return "Gujarati letter" if $c =~ /\xE0[\xAA-\xAB]/;
        return "Oriya digit" if $c =~ /\xE0\xAD[\xA6-\xAF]/;
        return "Oriya fraction" if $c =~ /\xE0\xAD[\xB2-\xB7]/;
        return "Oriya letter" if $c =~ /\xE0[\xAC-\xAD]/;
        return "Tamil digit" if $c =~ /\xE0\xAF[\xA6-\xAF]/;
        return "Tamil number" if $c =~ /\xE0\xAF[\xB0-\xB2]/; # number (10, 100, 1000)
        return "Tamil letter" if $c =~ /\xE0[\xAE-\xAF]/;
        return "Telegu digit" if $c =~ /\xE0\xB1[\xA6-\xAF]/;
        return "Telegu fraction" if $c =~ /\xE0\xB1[\xB8-\xBE]/;
        return "Telegu letter" if $c =~ /\xE0[\xB0-\xB1]/;
        return "Kannada digit" if $c =~ /\xE0\xB3[\xA6-\xAF]/;
        return "Kannada letter" if $c =~ /\xE0[\xB2-\xB3]/;
        return "Malayalam digit" if $c =~ /\xE0\xB5[\x98-\x9E\xA6-\xB8]/;
        return "Malayalam punctuation" if $c =~ /\xE0\xB5\xB9/; # date mark
        return "Malayalam letter" if $c =~ /\xE0[\xB4-\xB5]/;
        return "Sinhala digit" if $c =~ /\xE0\xB7[\xA6-\xAF]/;
        return "Sinhala punctuation" if $c =~ /\xE0\xB7\xB4/;
        return "Sinhala letter" if $c =~ /\xE0[\xB6-\xB7]/;
        return "Thai currency" if $c =~ /\xE0\xB8\xBF/;
        return "Thai digit" if $c =~ /\xE0\xB9[\x90-\x99]/;
        return "Thai character" if $c =~ /\xE0[\xB8-\xB9]/;
        return "Lao punctuation" if $c =~ /\xE0\xBA\xAF/; # Lao ellipsis
        return "Lao digit" if $c =~ /\xE0\xBB[\x90-\x99]/;
        return "Lao character" if $c =~ /\xE0[\xBA-\xBB]/;
        return "Tibetan punctuation" if $c =~ /\xE0\xBC[\x81-\x94]/;
        return "Tibetan sign" if $c =~ /\xE0\xBC[\x95-\x9F]/;
        return "Tibetan digit" if $c =~ /\xE0\xBC[\xA0-\xB3]/;
        return "Tibetan punctuation" if $c =~ /\xE0\xBC[\xB4-\xBD]/;
        return "Tibetan letter" if $c =~ /\xE0[\xBC-\xBF]/;
        return "Myanmar digit" if $c =~ /\xE1\x81[\x80-\x89]/;
        return "Myanmar digit" if $c =~ /\xE1\x82[\x90-\x99]/; # Myanmar Shan digits
        return "Myanmar punctuation" if $c =~ /\xE1\x81[\x8A-\x8B]/;
        return "Myanmar letter" if $c =~ /\xE1[\x80-\x81]/;
        return "Myanmar letter" if $c =~ /\xE1\x82[\x80-\x9F]/;
        return "Georgian punctuation" if $c =~ /\xE1\x83\xBB/;
        return "Georgian letter" if $c =~ /\xE1\x82[\xA0-\xBF]/;
        return "Georgian letter" if $c =~ /\xE1\x83/;
        return "Georgian letter" if $c =~ /\xE1\xB2[\x90-\xBF]/; # Georgian Mtavruli capital letters
        return "Georgian letter" if $c =~ /\xE2\xB4[\x80-\xAF]/; # Georgian small letters (Khutsuri)
        return "Korean Hangul letter" if $c =~ /\xE1[\x84-\x87]/;
        return "Ethiopic punctuation" if $c =~ /\xE1\x8D[\xA0-\xA8]/;
        return "Ethiopic digit" if $c =~ /\xE1\x8D[\xA9-\xB1]/;
        return "Ethiopic number" if $c =~ /\xE1\x8D[\xB2-\xBC]/;
        return "Ethiopic syllable" if $c =~ /\xE1[\x88-\x8D]/;
        return "Cherokee letter" if $c =~ /\xE1\x8E[\xA0-\xBF]/;
        return "Cherokee letter" if $c =~ /\xE1\x8F/;
        return "Canadian punctuation" if $c =~ /\xE1\x90\x80/; # Canadian Syllabics hyphen
        return "Canadian punctuation" if $c =~ /\xE1\x99\xAE/; # Canadian Syllabics full stop
        return "Canadian syllable" if $c =~ /\xE1[\x90-\x99]/;
        return "Canadian syllable" if $c =~ /\xE1\xA2[\xB0-\xBF]/;
        return "Canadian syllable" if $c =~ /\xE1\xA3/;
        return "Ogham whitespace" if $c =~ /\xE1\x9A\x80/;
        return "Ogham letter" if $c =~ /\xE1\x9A[\x81-\x9A]/;
        return "Ogham punctuation" if $c =~ /\xE1\x9A[\x9B-\x9C]/;
        return "Runic punctuation" if $c =~ /\xE1\x9B[\xAB-\xAD]/;
        return "Runic letter" if $c =~ /\xE1\x9A[\xA0-\xBF]/;
        return "Runic letter" if $c =~ /\xE1\x9B/;
        return "Khmer currency" if $c =~ /\xE1\x9F\x9B/;
        return "Khmer digit" if $c =~ /\xE1\x9F[\xA0-\xA9]/;
        return "Khmer letter" if $c =~ /\xE1[\x9E-\x9F]/;
        return "Mongolian punctuation" if $c =~ /\xE1\xA0[\x80-\x8A]/;
        return "Mongolian digit" if $c =~ /\xE1\xA0[\x90-\x99]/;
        return "Mongolian letter" if $c =~ /\xE1[\xA0-\xA1]/;
        return "Mongolian letter" if $c =~ /\xE1\xA2[\x80-\xAF]/;
        return "Buginese letter" if $c =~ /\xE1\xA8[\x80-\x9B]/;
        return "Buginese punctuation" if $c =~ /\xE1\xA8[\x9E-\x9F]/;
        return "Balinese letter" if $c =~ /\xE1\xAC/;
        return "Balinese letter" if $c =~ /\xE1\xAD[\x80-\x8F]/;
        return "Balinese digit" if $c =~ /\xE1\xAD[\x90-\x99]/;
        return "Balinese punctuation" if $c =~ /\xE1\xAD[\x9A-\xA0]/;
        return "Balinese symbol" if $c =~ /\xE1\xAD[\xA1-\xBF]/;
        return "Sundanese digit" if $c =~ /\xE1\xAE[\xB0-\xB9]/;
        return "Sundanese letter" if $c =~ /\xE1\xAE/;
        return "Cyrillic letter" if $c =~ /\xE1\xB2[\x80-\x8F]/;
        return "Sundanese punctuation" if $c =~ /\xE1\xB3[\x80-\x8F]/;
        return "IPA" if $c =~ /\xE1[\xB4-\xB6]/;
        return "non-ASCII Latin letter" if $c =~ /\xE1[\xB8-\xBB]/;
        return "Greek letter" if $c =~ /\xE1[\xBC-\xBF]/;
        return "non-ASCII whitespace" if $c =~ /\xE2\x80[\x80-\x8A\xAF]/;
        return "zero-width space" if $c =~ /\xE2\x80\x8B/;
        return "zero-width non-space" if $c =~ /\xE2\x80\x8C/;
        return "zero-width joiner" if $c =~ /\xE2\x80\x8D/;
        return "directional mark" if $c =~ /\xE2\x80[\x8E-\x8F\xAA-\xAE]/;
        return "non-ASCII punctuation" if $c =~ /\xE2\x80[\x90-\xBF]/;
        return "non-ASCII punctuation" if $c =~ /\xE2\x81[\x80-\x9E]/;
        return "superscript letter" if $c =~ /\xE2\x81[\xB1\xBF]/;
        return "superscript digit" if $c =~ /\xE2\x81[\xB0-\xB9]/;
        return "superscript punctuation" if $c =~ /\xE2\x81[\xBA-\xBE]/;
        return "subscript digit" if $c =~ /\xE2\x82[\x80-\x89]/;
        return "subscript punctuation" if $c =~ /\xE2\x82[\x8A-\x8E]/;
        return "non-ASCII currency" if $c =~ /\xE2\x82[\xA0-\xBF]/;
        return "letterlike symbol" if $c =~ /\xE2\x84/;
        return "letterlike symbol" if $c =~ /\xE2\x85[\x80-\x8F]/;
        return "fraction" if $c =~ /\xE2\x85[\x90-\x9E]/; # NEW
        return "Roman number" if $c =~ /\xE2\x85[\xA0-\xBF]/; # NEW
        return "arrow symbol" if $c =~ /\xE2\x86[\x90-\xBF]/;
        return "arrow symbol" if $c =~ /\xE2\x87/;
        return "mathematical operator" if $c =~ /\xE2[\x88-\x8B]/;
        return "technical symbol" if $c =~ /\xE2[\x8C-\x8F]/;
        return "enclosed alphanumeric" if $c =~ /\xE2\x91[\xA0-\xBF]/;
        return "enclosed alphanumeric" if $c =~ /\xE2[\x92-\x93]/;
        return "box drawing" if $c =~ /\xE2[\x94-\x95]/;
        return "geometric shape" if $c =~ /\xE2\x96[\xA0-\xBF]/;
        return "geometric shape" if $c =~ /\xE2\x97/;
        return "pictograph" if $c =~ /\xE2[\x98-\x9E]/;
        return "arrow symbol" if $c =~ /\xE2\xAC[\x80-\x91\xB0-\xBF]/;
        return "geometric shape" if $c =~ /\xE2\xAC[\x92-\xAF]/;
        return "arrow symbol" if $c =~ /\xE2\xAD[\x80-\x8F\x9A-\xBF]/;
        return "geometric shape" if $c =~ /\xE2\xAD[\x90-\x99]/;
        return "arrow symbol" if $c =~ /\xE2\xAE[\x80-\xB9]/;
        return "geometric shape" if $c =~ /\xE2\xAE[\xBA-\xBF]/;
        return "geometric shape" if $c =~ /\xE2\xAF[\x80-\x88\x8A-\x8F]/;
        return "symbol" if $c =~ /\xE2[\xAC-\xAF]/;
        return "Coptic fraction" if $c =~ /\xE2\xB3\xBD/;
        return "Coptic punctuation" if $c =~ /\xE2\xB3[\xB9-\xBF]/;
        return "Coptic letter" if $c =~ /\xE2[\xB2-\xB3]/;
        return "Georgian letter" if $c =~ /\xE2\xB4[\x80-\xAF]/;
        return "Tifinagh punctuation" if $c =~ /\xE2\xB5\xB0/;
        return "Tifinagh letter" if $c =~ /\xE2\xB4[\xB0-\xBF]/;
        return "Tifinagh letter" if $c =~ /\xE2\xB5/;
        return "Ethiopic syllable" if $c =~ /\xE2\xB6/;
        return "Ethiopic syllable" if $c =~ /\xE2\xB7[\x80-\x9F]/;
        return "non-ASCII punctuation" if $c =~ /\xE3\x80[\x80-\x91\x94-\x9F\xB0\xBB-\xBD]/;
        return "symbol" if $c =~ /\xE3\x80[\x91\x92\xA0\xB6\xB7]/;
        return "Japanese hiragana character" if $c =~ /\xE3\x81/;
        return "Japanese hiragana character" if $c =~ /\xE3\x82[\x80-\x9F]/;
        return "Japanese katakana character" if $c =~ /\xE3\x82[\xA0-\xBF]/;
        return "Japanese katakana character" if $c =~ /\xE3\x83/;
        return "Bopomofo letter" if $c =~ /\xE3\x84[\x80-\xAF]/;
        return "Korean Hangul letter" if $c =~ /\xE3\x84[\xB0-\xBF]/;
        return "Korean Hangul letter" if $c =~ /\xE3\x85/;
        return "Korean Hangul letter" if $c =~ /\xE3\x86[\x80-\x8F]/;
        return "Bopomofo letter" if $c =~ /\xE3\x86[\xA0-\xBF]/;
        return "CJK stroke" if $c =~ /\xE3\x87[\x80-\xAF]/;
        return "Japanese kana character" if $c =~ /\xE3\x87[\xB0-\xBF]/;
        return "CJK symbol" if $c =~ /\xE3[\x88-\x8B]/;
        return "CJK square Latin abbreviation" if $c =~ /\xE3\x8D[\xB1-\xBA]/;
        return "CJK square Latin abbreviation" if $c =~ /\xE3\x8E/;
        return "CJK square Latin abbreviation" if $c =~ /\xE3\x8F[\x80-\x9F\xBF]/;
        return "CJK character" if $c =~ /\xE4[\xB8-\xBF]/;
        return "CJK character" if $c =~ /[\xE5-\xE9]/;
        return "Yi syllable" if $c =~ /\xEA[\x80-\x92]/;
        return "Lisu letter" if $c =~ /\xEA\x93[\x90-\xBD]/;
        return "Lisu punctuation" if $c =~ /\xEA\x93[\xBE-\xBF]/;
        return "Cyrillic letter" if $c =~ /\xEA\x99/;
        return "Cyrillic letter" if $c =~ /\xEA\x9A[\x80-\x9F]/;
        return "modifier tone" if $c =~ /\xEA\x9C[\x80-\xA1]/;
        return "Javanese punctuation" if $c =~ /\xEA\xA7[\x81-\x8D\x9E-\x9F]/;
        return "Javanese digit" if $c =~ /\xEA\xA7[\x90-\x99]/;
        return "Javanese letter" if $c =~ /\xEA\xA6/;
        return "Javanese letter" if $c =~ /\xEA\xA7[\x80-\x9F]/;
        return "Ethiopic syllable" if $c =~ /\xEA\xAC[\x80-\xAF]/;
        return "Cherokee letter" if $c =~ /\xEA\xAD[\xB0-\xBF]/;
        return "Cherokee letter" if $c =~ /\xEA\xAE/;
        return "Meetai Mayek digit" if $c =~ /\xEA\xAF[\xB0-\xB9]/;
        return "Meetai Mayek letter" if $c =~ /\xEA\xAF/;
        return "Korean Hangul syllable" if $c =~ /\xEA[\xB0-\xBF]/;
        return "Korean Hangul syllable" if $c =~ /[\xEB-\xEC]/;
        return "Korean Hangul syllable" if $c =~ /\xED[\x80-\x9E]/;
        return "Klingon letter" if $c =~ /\xEF\xA3[\x90-\xA9]/;
        return "Klingon digit" if $c =~ /\xEF\xA3[\xB0-\xB9]/;
        return "Klingon punctuation" if $c =~ /\xEF\xA3[\xBD-\xBE]/;
        return "Klingon symbol" if $c =~ /\xEF\xA3\xBF/;
        return "private use character" if $c =~ /\xEE/;
        return "Latin typographic ligature" if $c =~ /\xEF\xAC[\x80-\x86]/;
        return "Hebrew presentation letter" if $c =~ /\xEF\xAC[\x9D-\xBF]/;
        return "Hebrew presentation letter" if $c =~ /\xEF\xAD[\x80-\x8F]/;
        return "Arabic presentation letter" if $c =~ /\xEF\xAD[\x90-\xBF]/;
        return "Arabic presentation letter" if $c =~ /\xEF[\xAE-\xB7]/;
        return "non-ASCII punctuation" if $c =~ /\xEF\xB8[\x90-\x99]/;
        return "non-ASCII punctuation" if $c =~ /\xEF\xB8[\xB0-\xBF]/;
        return "non-ASCII punctuation" if $c =~ /\xEF\xB9[\x80-\xAB]/;
        return "Arabic presentation letter" if $c =~ /\xEF\xB9[\xB0-\xBF]/;
        return "Arabic presentation letter" if $c =~ /\xEF\xBA/;
        return "Arabic presentation letter" if $c =~ /\xEF\xBB[\x80-\xBC]/;
        return "byte-order mark/zero-width no-break space" if $c eq "\xEF\xBB\xBF";
        return "fullwidth currency" if $c =~ /\xEF\xBC\x84/;
        return "fullwidth digit" if $c =~ /\xEF\xBC[\x90-\x99]/;
        return "fullwidth Latin letter" if $c =~ /\xEF\xBC[\xA1-\xBA]/;
        return "fullwidth Latin letter" if $c =~ /\xEF\xBD[\x81-\x9A]/;
        return "fullwidth punctuation" if $c =~ /\xEF\xBC/;
        # Stops at A0 so that A1..A4 reach the halfwidth rule below.
        return "fullwidth punctuation" if $c =~ /\xEF\xBD[\x9B-\xA0]/;
        return "halfwidth Japanese punctuation" if $c =~ /\xEF\xBD[\xA1-\xA4]/;
        return "halfwidth Japanese katakana character" if $c =~ /\xEF\xBD[\xA5-\xBF]/;
        return "halfwidth Japanese katakana character" if $c =~ /\xEF\xBE[\x80-\x9F]/;
        return "fullwidth currency" if $c =~ /\xEF\xBF[\xA0-\xA6]/;
        return "replacement character" if $c eq "\xEF\xBF\xBD";
    } elsif ($c =~ /^[\xF0-\xF7]/) {
        return "non-UTF8 (invalid)" unless $c =~ /^[\xF0-\xF7][\x80-\xBF]{3}$/;
        return "non-shortest-UTF8 (invalid)" if $c =~ /\xF0[\x80-\x8F]/;
        return "Linear B syllable" if $c =~ /\xF0\x90\x80/;
        return "Linear B syllable" if $c =~ /\xF0\x90\x81[\x80-\x8F]/;
        return "Linear B symbol" if $c =~ /\xF0\x90\x81[\x90-\x9F]/;
        return "Linear B ideogram" if $c =~ /\xF0\x90[\x82-\x83]/;
        return "Gothic letter" if $c =~ /\xF0\x90\x8C[\xB0-\xBF]/;
        return "Gothic letter" if $c =~ /\xF0\x90\x8D[\x80-\x8F]/;
        return "Phoenician letter" if $c =~ /\xF0\x90\xA4[\x80-\x95]/;
        return "Phoenician number" if $c =~ /\xF0\x90\xA4[\x96-\x9B]/;
        return "Phoenician punctuation" if $c =~ /\xF0\x90\xA4\x9F/; # word separator
        return "Old Hungarian number" if $c =~ /\xF0\x90\xB3[\xBA-\xBF]/;
        return "Old Hungarian letter" if $c =~ /\xF0\x90[\xB2-\xB3]/;
        return "Cuneiform digit" if $c =~ /\xF0\x92\x90/; # numeric sign
        return "Cuneiform digit" if $c =~ /\xF0\x92\x91[\x80-\xAF]/; # numeric sign
        return "Cuneiform punctuation" if $c =~ /\xF0\x92\x91[\xB0-\xBF]/;
        return "Cuneiform sign" if $c =~ /\xF0\x92[\x80-\x95]/;
        return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x81\xA8/;
        return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x82[\xAD-\xB6]/;
        return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x86[\x90\xBC-\xBF]/;
        return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x87[\x80-\x84]/;
        return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8D[\xA2-\xAB]/;
        return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8E[\x86-\x92]/;
        return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8F[\xBA-\xBF]/;
        return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x90[\x80-\x83]/;
        return "Egyptian hieroglyph" if $c =~ /\xF0\x93[\x80-\x90]/;
        return "enclosed alphanumeric" if $c =~ /\xF0\x9F[\x84-\x87]/;
        return "Mahjong symbol" if $c =~ /\xF0\x9F\x80[\x80-\xAF]/;
        return "Domino symbol" if $c =~ /\xF0\x9F\x80[\xB0-\xBF]/;
        return "Domino symbol" if $c =~ /\xF0\x9F\x81/;
        return "Domino symbol" if $c =~ /\xF0\x9F\x82[\x80-\x9F]/;
        return "Playing card symbol" if $c =~ /\xF0\x9F\x82[\xA0-\xBF]/;
        return "Playing card symbol" if $c =~ /\xF0\x9F\x83/;
        return "CJK symbol" if $c =~ /\xF0\x9F[\x88-\x8B]/;
        return "pictograph" if $c =~ /\xF0\x9F[\x8C-\x9B]/;
        return "geometric shape" if $c =~ /\xF0\x9F[\x9E-\x9F]/;
        return "non-ASCII punctuation" if $c =~ /\xF0\x9F[\xA0-\xA3]/;
        return "pictograph" if $c =~ /\xF0\x9F[\xA4-\xAB]/;
        return "CJK character" if $c =~ /\xF0[\xA0-\xAF]/;
        return "tag" if $c =~ /\xF3\xA0[\x80-\x81]/;
        return "variation selector" if $c =~ /\xF3\xA0[\x84-\x87]/;
        return "private use character" if $c =~ /\xF3[\xB0-\xBF]/;
        return "private use character" if $c =~ /\xF4[\x80-\x8F]/;
        # ...
    } elsif ($c =~ /^[\xF8-\xFB]/) {
        return "non-UTF8 (invalid)" unless $c =~ /^[\xF8-\xFB][\x80-\xBF]{4}$/;
    } elsif ($c =~ /^[\xFC-\xFD]/) {
        return "non-UTF8 (invalid)" unless $c =~ /^[\xFC-\xFD][\x80-\xBF]{5}$/;
    } elsif ($c =~ /^\xFE/) {
        return "non-UTF8 (invalid)" unless $c =~ /^\xFE[\x80-\xBF]{6}$/;
    } else {
        return "non-UTF8 (invalid)";
    }
    return "other character";
}
|
1401 |
+
|
1402 |
+
1;
|
1403 |
+
|
1404 |
+
|
uroman/lib/NLP/stringDistance.pm
ADDED
@@ -0,0 +1,724 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
################################################################
|
2 |
+
# #
|
3 |
+
# stringDistance #
|
4 |
+
# #
|
5 |
+
################################################################
|
6 |
+
|
7 |
+
package NLP::stringDistance;
|
8 |
+
|
9 |
+
use List::Util qw(min max);
|
10 |
+
$utf8 = NLP::UTF8;
|
11 |
+
$util = NLP::utilities;
|
12 |
+
$romanizer = NLP::Romanizer;
|
13 |
+
|
14 |
+
%dummy_ht = ();
|
15 |
+
|
16 |
+
sub rule_string_expansion {
    local($this, *ht, $s, $lang_code) = @_;

    # Records, for rule string $s under $lang_code, which shorter leading
    # substrings can be extended to which longer leading substrings:
    #   $ht{RULE_STRING_EXPANSION}->{$lang_code}->{prefix}->{longer prefix} = 1
    #   $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{prefix} = 1
    # *ht is the caller's hash table, passed as a typeglob.
    # NOTE(review): split_into_utf8_characters presumably returns the list of
    # UTF-8 characters of $s -- confirm against NLP::UTF8.
    my @chars = $utf8->split_into_utf8_characters($s, "return only chars, return trailing whitespaces", *dummy_ht);
    my $last_index = $#chars;
    my $prefix = "";
    for my $prefix_end (0 .. $last_index - 1) {
        # Grow the prefix one character at a time instead of re-joining a slice.
        $prefix .= $chars[$prefix_end];
        my $extension = $prefix;
        for my $extension_end ($prefix_end + 1 .. $last_index) {
            $extension .= $chars[$extension_end];
            $ht{RULE_STRING_EXPANSION}->{$lang_code}->{$prefix}->{$extension} = 1;
            $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$prefix} = 1;
        }
    }
}
|
31 |
+
|
32 |
+
sub load_string_distance_data {
    local($this, $filename, *ht, $verbose) = @_;

    # Reads string-distance cost rules from $filename into the caller's hash
    # table *ht (passed as a typeglob).
    #
    # Each non-blank, non-comment line is queried for the slots s1, s2, cost,
    # lc1, lc2, left1, left2, right1, right2, in-lc1, in-lc2, out-lc1, out-lc2
    # via $util->slot_value_in_double_colon_del_list.
    # NOTE(review): the warnings below suggest slots are written "::name value";
    # confirm against NLP::utilities.
    #
    # Tables written: COST, RULE_STRING, LEFT1/2, RIGHT1/2, INLC1/2, OUTLC1/2,
    # plus the expansion tables filled by rule_string_expansion.
    # Dies if the file cannot be opened. Malformed lines are reported on
    # STDERR and skipped. $verbose (default 0) prints a summary at the end.

    $verbose = 0 unless defined($verbose);
    open(IN,$filename) || die "Could not open $filename";
    my $line_number = 0;
    my $n_cost_rules = 0;
    while (<IN>) {
        $line_number++;
        my $line = $_;
        $line =~ s/^\xEF\xBB\xBF//;   # strip UTF-8 byte-order mark
        $line =~ s/\s*$//;
        next if $line =~ /^\s*(\#.*)?$/;   # blank line or comment
        print STDERR "** Warning: line $line_number contains suspicious control character: $line\n" if $line =~ /[\x00-\x1F]/;
        my $s1 = $util->slot_value_in_double_colon_del_list($line, "s1");
        my $s2 = $util->slot_value_in_double_colon_del_list($line, "s2");
        $s1 = $util->dequote_string($s1);  # 'can\'t' => can't
        $s2 = $util->dequote_string($s2);
        my $cost = $util->slot_value_in_double_colon_del_list($line, "cost");
        if (($s1 eq "") && ($s2 eq "")) {
            print STDERR "Ignoring bad line $line_number in $filename, because both s1 and s2 are empty strings\n";
            next;
        }
        unless ($cost =~ /^\d+(\.\d+)?$/) {
            if ($cost eq "") {
                print STDERR "Ignoring bad line $line_number in $filename, because of missing cost\n";
            } else {
                print STDERR "Ignoring bad line $line_number in $filename, because of ill-formed cost $cost\n";
            }
            next;
        }
        my $lang_code1_s = $util->slot_value_in_double_colon_del_list($line, "lc1");
        my $lang_code2_s = $util->slot_value_in_double_colon_del_list($line, "lc2");
        # A missing language code is represented by the single code "".
        my @lang_codes_1 = ($lang_code1_s eq "") ? ("") : split(/,\s*/, $lang_code1_s);
        my @lang_codes_2 = ($lang_code2_s eq "") ? ("") : split(/,\s*/, $lang_code2_s);
        my $left_context1 = $util->slot_value_in_double_colon_del_list($line, "left1");
        my $left_context2 = $util->slot_value_in_double_colon_del_list($line, "left2");
        my $right_context1 = $util->slot_value_in_double_colon_del_list($line, "right1");
        my $right_context2 = $util->slot_value_in_double_colon_del_list($line, "right2");
        # Un-numbered ::left / ::right slots are rejected with a warning.
        my $bad_left = $util->slot_value_in_double_colon_del_list($line, "left");
        if ($bad_left) {
            print STDERR "** Warning: slot '::left $bad_left' in line $line_number\n";
            next;
        }
        my $bad_right = $util->slot_value_in_double_colon_del_list($line, "right");
        if ($bad_right) {
            print STDERR "** Warning: slot '::right $bad_right' in line $line_number\n";
            next;
        }
        my $in_lang_codes1 = $util->slot_value_in_double_colon_del_list($line, "in-lc1");
        my $in_lang_codes2 = $util->slot_value_in_double_colon_del_list($line, "in-lc2");
        my $out_lang_codes1 = $util->slot_value_in_double_colon_del_list($line, "out-lc1");
        my $out_lang_codes2 = $util->slot_value_in_double_colon_del_list($line, "out-lc2");
        # Left contexts must be written /regex/; the slashes are stripped.
        if ($left_context1) {
            if ($left_context1 =~ /^\/.*\/$/) {
                $left_context1 =~ s/^\///;
                $left_context1 =~ s/\/$//;
            } else {
                print STDERR "Ignoring unrecognized non-regular-express ::left1 $left_context1 in $line_number of $filename\n";
                $left_context1 = "";
            }
        }
        if ($left_context2) {
            if ($left_context2 =~ /^\/.*\/$/) {
                $left_context2 =~ s/^\///;
                $left_context2 =~ s/\/$//;
            } else {
                # Report the offending value BEFORE clearing it.
                print STDERR "Ignoring unrecognized non-regular-express ::left2 $left_context2 in $line_number of $filename\n";
                $left_context2 = "";
            }
        }
        # Right contexts must be a sequence of bracketed items: [..][..]...
        if ($right_context1) {
            unless ($right_context1 =~ /^(\[[^\[\]]*\])+$/) {
                print STDERR "Ignoring unrecognized right-context ::right1 $right_context1 in $line_number of $filename\n";
                $right_context1 = "";
            }
        }
        if ($right_context2) {
            unless ($right_context2 =~ /^(\[[^\[\]]*\])+$/) {
                print STDERR "Ignoring unrecognized right-context ::right2 $right_context2 in $line_number of $filename\n";
                $right_context2 = "";
            }
        }
        foreach my $lang_code1 (@lang_codes_1) {
            foreach my $lang_code2 (@lang_codes_2) {
                $n_cost_rules++;
                my $cost_rule_id = $n_cost_rules;
                $ht{COST}->{$lang_code1}->{$lang_code2}->{$s1}->{$s2}->{$cost_rule_id} = $cost;
                $ht{RULE_STRING}->{$lang_code1}->{$s1} = 1;
                $ht{RULE_STRING}->{$lang_code2}->{$s2} = 1;
                $ht{LEFT1}->{$cost_rule_id} = $left_context1;
                $ht{LEFT2}->{$cost_rule_id} = $left_context2;
                $ht{RIGHT1}->{$cost_rule_id} = $right_context1;
                $ht{RIGHT2}->{$cost_rule_id} = $right_context2;
                $ht{INLC1}->{$cost_rule_id} = $in_lang_codes1;
                $ht{INLC2}->{$cost_rule_id} = $in_lang_codes2;
                $ht{OUTLC1}->{$cost_rule_id} = $out_lang_codes1;
                $ht{OUTLC2}->{$cost_rule_id} = $out_lang_codes2;
                # Unless the rule is fully symmetric, also register the
                # mirrored rule (sides 1 and 2 swapped) under its own id.
                unless (($s1 eq $s2)
                     && ($lang_code1 eq $lang_code2)
                     && ($left_context1 eq $left_context2)
                     && ($right_context1 eq $right_context2)
                     && ($in_lang_codes1 eq $in_lang_codes2)
                     && ($out_lang_codes1 eq $out_lang_codes2)) {
                    $n_cost_rules++;
                    $cost_rule_id = $n_cost_rules;
                    $ht{COST}->{$lang_code2}->{$lang_code1}->{$s2}->{$s1}->{$cost_rule_id} = $cost;
                    $ht{LEFT1}->{$cost_rule_id} = $left_context2;
                    $ht{LEFT2}->{$cost_rule_id} = $left_context1;
                    $ht{RIGHT1}->{$cost_rule_id} = $right_context2;
                    $ht{RIGHT2}->{$cost_rule_id} = $right_context1;
                    $ht{INLC1}->{$cost_rule_id} = $in_lang_codes2;
                    $ht{INLC2}->{$cost_rule_id} = $in_lang_codes1;
                    $ht{OUTLC1}->{$cost_rule_id} = $out_lang_codes2;
                    $ht{OUTLC2}->{$cost_rule_id} = $out_lang_codes1;
                }
                $this->rule_string_expansion(*ht, $s1, $lang_code1);
                $this->rule_string_expansion(*ht, $s2, $lang_code2);
            }
        }
    }
    close(IN);
    print STDERR "Read in $n_cost_rules rules from $line_number lines in $filename\n" if $verbose;
}
|
157 |
+
|
158 |
+
sub romanized_string_to_simple_chart {
    local($this, $s, *chart_ht) = @_;

    # Fills the caller's chart (*chart_ht, passed as a typeglob) from string
    # $s: sets N_CHARS to the number of characters, resets N_NODES to 0, and
    # asks the romanizer to add one node per character spanning positions
    # k .. k+1.
    # NOTE(review): add_node presumably maintains N_NODES and the NODE_*
    # tables -- confirm against NLP::Romanizer.
    my @chars = $utf8->split_into_utf8_characters($s, "return only chars, return trailing whitespaces", *dummy_ht);
    $chart_ht{N_CHARS} = scalar(@chars);
    $chart_ht{N_NODES} = 0;
    for my $pos (0 .. $#chars) {
        $romanizer->add_node($chars[$pos], $pos, $pos + 1, *chart_ht, "", "");
    }
}
|
168 |
+
|
169 |
+
sub linearize_chart_points {
    local($this, *chart_ht, $chart_id, *sd_ht, $verbose) = @_;

    # Maps the positions of a romanization chart (*chart_ht) onto a finer
    # "linear" position scale in which every character of every node's
    # romanization gets its own edge, and stores the result in *sd_ht under
    # $chart_id:
    #   POS2LINPOS / LINPOS2POS            chart position <-> linear position
    #   SPLITPOS2LINPOS / LINPOS2SPLITPOS  linear positions inside a
    #                                      multi-character node
    #   LIN_IJ_ROMAN->{from}->{to}->{char} single-character edges
    #   MAXPOS, MAXLINPOS                  last chart / linear position
    # $verbose (default 0) traces the construction on STDERR.

    $verbose = 0 unless defined($verbose);
    print STDERR "Linearize $chart_id\n" if $verbose;
    my $current_chart_pos = 0;
    my $current_linear_chart_pos = 0;
    $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos} = $current_linear_chart_pos;
    $sd_ht{LINPOS2POS}->{$chart_id}->{$current_linear_chart_pos} = $current_chart_pos;
    print STDERR " LINPOS2POS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos\n" if $verbose;
    my @end_chart_positions = keys %{$chart_ht{NODES_ENDING_AT}};
    my $end_chart_pos = (@end_chart_positions) ? max(@end_chart_positions) : 0;
    $sd_ht{MAXPOS}->{$chart_id} = $end_chart_pos;
    print STDERR " Chart span: $current_chart_pos-$end_chart_pos\n" if $verbose;

    # Pass 1: hand out linear positions, first to the interior split points of
    # multi-character nodes, then to each chart position where a node ends.
    while ($current_chart_pos < $end_chart_pos) {
        my @node_ids = keys %{$chart_ht{NODES_STARTING_AT}->{$current_chart_pos}};
        foreach my $node_id (@node_ids) {
            my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
            my @roman_chars = $utf8->split_into_utf8_characters($roman_s, "return only chars, return trailing whitespaces", *dummy_ht);
            print STDERR " $current_chart_pos/$current_linear_chart_pos node: $node_id $roman_s (@roman_chars)\n" if $verbose;
            if ($#roman_chars >= 1) {
                foreach my $i ((1 .. $#roman_chars)) {
                    $current_linear_chart_pos++;
                    $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{$i} = $current_linear_chart_pos;
                    $sd_ht{LINPOS2SPLITPOS}->{$chart_id}->{$current_linear_chart_pos}->{$current_chart_pos}->{$node_id}->{$i} = 1;
                    print STDERR " LINPOS2SPLITPOS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos NODE: $node_id I: $i\n" if $verbose;
                }
            }
        }
        $current_chart_pos++;
        if ($util->member($current_chart_pos, @end_chart_positions)) {
            $current_linear_chart_pos++;
            $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos} = $current_linear_chart_pos;
            $sd_ht{LINPOS2POS}->{$chart_id}->{$current_linear_chart_pos} = $current_chart_pos;
            print STDERR " LINPOS2POS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos\n" if $verbose;
        }
    }

    # Pass 2: create the single-character edges between linear positions.
    $current_chart_pos = 0;
    while ($current_chart_pos <= $end_chart_pos) {
        my $current_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos};
        $current_linear_chart_pos = "?" unless defined($current_linear_chart_pos);
        my @node_ids = keys %{$chart_ht{NODES_STARTING_AT}->{$current_chart_pos}};
        foreach my $node_id (@node_ids) {
            my $end_pos = $chart_ht{NODE_END}->{$node_id};
            my $end_linpos = $sd_ht{POS2LINPOS}->{$chart_id}->{$end_pos};
            my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
            my @roman_chars = $utf8->split_into_utf8_characters($roman_s, "return only chars, return trailing whitespaces", *dummy_ht);
            print STDERR " LINROM.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos NODE: $node_id CHARS: @roman_chars\n" if $verbose;
            if (@roman_chars) {
                foreach my $i ((0 .. $#roman_chars)) {
                    # First character starts at the node's own position, later
                    # ones at the split point allocated in pass 1.
                    my $from_linear_chart_pos
                        = (($i == 0)
                           ? $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos}
                           : $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{$i});
                    print STDERR " FROM.$chart_id I: $i POS: $current_chart_pos NODE: $node_id FROM: $from_linear_chart_pos\n" if $verbose;
                    # Last character ends at the node's end, earlier ones at
                    # the next split point.
                    my $to_linear_chart_pos
                        = (($i == $#roman_chars)
                           ? $end_linpos
                           : $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{($i+1)});
                    print STDERR " TO.$chart_id I: $i POS: $current_chart_pos NODE: $node_id TO: $to_linear_chart_pos\n" if $verbose;
                    my $roman_char = $roman_chars[$i];
                    $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}->{$to_linear_chart_pos}->{$roman_char} = 1;
                }
            } else {
                # Node with an empty romanization: add an empty-string edge to
                # the next chart position that has a linear position.
                my $from_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos};
                my $to_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{($current_chart_pos+1)};
                my $i = 1;
                # Bounded by the chart end: positions beyond $end_chart_pos are
                # never mapped, so scanning past it could never terminate.
                while ((! defined($to_linear_chart_pos))
                       && (($current_chart_pos + $i) < $end_chart_pos)) {
                    $i++;
                    $to_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{($current_chart_pos+$i)};
                }
                if (defined($from_linear_chart_pos) && defined($to_linear_chart_pos)) {
                    $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}->{$to_linear_chart_pos}->{""} = 1;
                } else {
                    print STDERR " UNDEF.$chart_id from: "
                        . ((defined($from_linear_chart_pos)) ? $from_linear_chart_pos : "?")
                        . " to: "
                        . ((defined($to_linear_chart_pos)) ? $to_linear_chart_pos : "?")
                        . "\n";
                }
            }
        }
        $current_chart_pos++;
    }
    $sd_ht{MAXLINPOS}->{$chart_id} = $sd_ht{POS2LINPOS}->{$chart_id}->{$end_chart_pos};
}
|
257 |
+
|
258 |
+
sub expand_lin_ij_roman {
    local($this, *sd_ht, $chart_id, $lang_code, *ht) = @_;

    # Walks every edge (from, to, romanization) of the linearized chart
    # $chart_id in *sd_ht, in ascending position order. For each romanization
    # that is the proper prefix of some rule string -- for $lang_code or for
    # the language-independent code "" -- it starts the recursive expansion
    # that adds the longer multi-character edges.
    foreach my $from (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}}) {
        foreach my $to (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from}}) {
            foreach my $edge_roman (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from}->{$to}}) {
                my $expandable = $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$edge_roman}
                              || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$edge_roman};
                next unless $expandable;
                $this->expand_lin_ij_roman_rec(*sd_ht, $chart_id, $from, $to, $edge_roman, $lang_code, *ht);
            }
        }
    }
}
|
272 |
+
|
273 |
+
sub expand_lin_ij_roman_rec {
    local($this, *sd_ht, $chart_id, $start, $end, $roman, $lang_code, *ht) = @_;

    # Recursive step of expand_lin_ij_roman: $roman is the romanization of an
    # edge $start-$end. It is concatenated with each edge that leaves $end.
    # A concatenation that is a rule string is stored as a new edge from
    # $start; a concatenation that is itself a prefix of a longer rule string
    # is expanded further. Both checks consult $lang_code and the
    # language-independent code "".
    my $can_grow = $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$roman}
                || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$roman};
    return unless $can_grow;

    foreach my $next_end (keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$end}}) {
        foreach my $suffix (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$end}->{$next_end}}) {
            my $longer = $roman . $suffix;

            my $is_rule_string = $ht{RULE_STRING}->{$lang_code}->{$longer}
                              || $ht{RULE_STRING}->{""}->{$longer};
            $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start}->{$next_end}->{$longer} = 1
                if $is_rule_string;

            my $longer_can_grow = $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$longer}
                               || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$longer};
            $this->expand_lin_ij_roman_rec(*sd_ht, $chart_id, $start, $next_end, $longer, $lang_code, *ht)
                if $longer_can_grow;
        }
    }
}
|
294 |
+
|
295 |
+
sub trace_string_distance {
    local($this, *sd_ht, $chart1_id, $chart2_id, $control, $line_number, $cost) = @_;

    # Walks the back-pointers (PREC_I / PREC_J) that string_distance stored in
    # *sd_ht for the chart pair, from the end positions back to 0/0.
    #
    # $control is matched for the substrings "verbose" and "chunks":
    #   neither    returns the steps with a non-zero incremental cost as
    #              space-separated "roman1/roman2:cost" items, in
    #              left-to-right order
    #   "verbose"  every step is listed, including its position spans
    #   "chunks"   nothing is built for the caller; maximal runs of steps
    #              with non-zero cost are merged and counted in
    #              $sd_ht{N_COST_CHUNK}, with $line_number recorded in
    #              $sd_ht{EX_COST_CHUNK}. A "*" is appended to the chunk key
    #              when $cost > 5. No meaningful value is returned.
    # Returns "mismatch" if string_distance flagged the pair as MISMATCH.

    my $chart_comb_id = join("/", $chart1_id, $chart2_id);
    return "mismatch" if $sd_ht{MISMATCH}->{$chart_comb_id};
    my $chart1_end = $sd_ht{MAXLINPOS}->{$chart1_id};
    my $chart2_end = $sd_ht{MAXLINPOS}->{$chart2_id};
    my $verbose = ($control =~ /verbose/);
    my $chunks_p = ($control =~ /chunks/);
    my @traces = ();
    my @s1_s = ();
    my @s2_s = ();
    my @e1_s = ();
    my @e2_s = ();
    my @r1_s = ();
    my @r2_s = ();
    my @ic_s = ();

    while ($chart1_end || $chart2_end) {
        my $incr_cost = $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
        my $prec_i = $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
        my $prec_j = $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
        if ($incr_cost || $verbose || $chunks_p) {
            my $roman1 = $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
            my $roman2 = $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
            if ($verbose) {
                push(@traces, "$prec_i-$chart1_end/$prec_j-$chart2_end:$roman1/$roman2:$incr_cost");
            } else {
                if (defined($roman1)) {
                    push(@traces, "$roman1/$roman2:$incr_cost");
                } else {
                    # Broken back-pointer chain: report it, printing "?" for
                    # any predecessor that is undefined.
                    my $print_prec_i = (defined($prec_i)) ? $prec_i : "?";
                    my $print_prec_j = (defined($prec_j)) ? $prec_j : "?";
                    print STDERR " $print_prec_i-$chart1_end, $print_prec_j-$chart2_end\n";
                }
            }
            if ($chunks_p) {
                push(@s1_s, $prec_i);
                push(@s2_s, $prec_j);
                push(@e1_s, $chart1_end);
                push(@e2_s, $chart2_end);
                push(@r1_s, $roman1);
                push(@r2_s, $roman2);
                push(@ic_s, $incr_cost);
            }
        }
        $chart1_end = $prec_i;
        $chart2_end = $prec_j;
    }
    if ($chunks_p) {
        # Steps were collected right-to-left, so each step's romanization is
        # prepended to the chunk being built.
        my $r1 = "";
        my $r2 = "";
        my $tc = 0;
        my $in_chunk = 0;
        foreach my $i ((0 .. $#ic_s)) {
            if ($ic_s[$i]) {
                $r1 = $r1_s[$i] . $r1;
                $r2 = $r2_s[$i] . $r2;
                $tc += $ic_s[$i];
                $in_chunk = 1;
            } elsif ($in_chunk) {
                # A zero-cost step closes the current chunk.
                my $chunk = "$r1/$r2/$tc";
                $chunk .= "*" if $cost > 5;
                $sd_ht{N_COST_CHUNK}->{$chunk} = ($sd_ht{N_COST_CHUNK}->{$chunk} || 0) + 1;
                $sd_ht{EX_COST_CHUNK}->{$chunk}->{$line_number} = 1;
                $r1 = "";
                $r2 = "";
                $tc = 0;
                $in_chunk = 0;
            }
        }
        if ($in_chunk) {
            # Chunk still open when the start of the strings was reached.
            my $chunk = "$r1/$r2/$tc";
            $chunk .= "*" if $cost > 5;
            $sd_ht{N_COST_CHUNK}->{$chunk} = ($sd_ht{N_COST_CHUNK}->{$chunk} || 0) + 1;
            $sd_ht{EX_COST_CHUNK}->{$chunk}->{$line_number} = 1;
        }
    } else {
        return join(" ", reverse @traces);
    }
}
|
377 |
+
|
378 |
+
sub right_context_match {
    local($this, $right_context_rule, *sd_ht, $chart_id, $start_pos) = @_;

    # Checks whether the linearized chart $chart_id in *sd_ht, read from
    # linear position $start_pos onward, satisfies $right_context_rule.
    #
    # The rule is a sequence of bracketed items, e.g. "[aeiou][nm]". Each item
    # is used as a regex character class that must match the first character
    # of an edge leaving the current position; the rest of the rule is then
    # checked from that edge's end. At the end of the chart (no outgoing
    # edges) an item matches only if it is empty or contains "$".
    # Returns 1 on match, 0 otherwise. The empty rule always matches.

    return 1 if $right_context_rule eq "";
    # Lexical captures: the recursive call must not clobber them.
    my ($right_context_item, $right_context_rest) = ($right_context_rule =~ /^\[([^\[\]]*)\]*(.*)$/);
    return 0 unless defined($right_context_item);

    # "$" in an item stands for end-of-string; escape it so that inside the
    # character class below it is a literal and not an interpolated variable.
    my $guarded_right_context_item = $right_context_item;
    $guarded_right_context_item =~ s/\$/\\\$/g;
    # Sorted so that the outcome does not depend on hash iteration order; the
    # nearest end position (shortest edge) is tried first.
    my @end_positions = sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start_pos}};
    return 1 if ($#end_positions == -1)
             && (($right_context_item eq "")
                 || ($right_context_item =~ /\$/));
    # An empty item can only match at the end of the chart. Without this guard
    # the class below would be the invalid regex /^[]/, which is fatal.
    return 0 if $right_context_item eq "";
    foreach my $end_pos (@end_positions) {
        my @romans = sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start_pos}->{$end_pos}};
        foreach my $roman (@romans) {
            if ($roman =~ /^[$guarded_right_context_item]/) {
                return $this->right_context_match($right_context_rest, *sd_ht, $chart_id, $end_pos);
            }
        }
    }
    return 0;
}
|
400 |
+
|
401 |
+
sub string_distance {
    local($this, *sd_ht, $chart1_id, $chart2_id, $lang_code1, $lang_code2, *ht, $control) = @_;

    # Computes the minimum-cost alignment between two linearized charts
    # (see linearize_chart_points) using the cost rules in *ht.
    #
    # A dynamic-programming table $sd_ht{COST_IJ}->{"id1/id2"}->{i}->{j} holds
    # the cheapest known cost of aligning chart 1 up to linear position i with
    # chart 2 up to linear position j. A cell is extended either by an
    # identical romanization on both sides (cost 0, rule "IDENTITY") or by a
    # cost rule whose left/right context restrictions are satisfied. Rules are
    # looked up for the given language codes and for the generic code "".
    # Back-pointers and per-step data are stored in PREC_I, PREC_J, ROMAN1,
    # ROMAN2, INCR_COST_IJ, COST_RULE and COMB_LEFT_ROMAN1/2 for later use by
    # trace_string_distance.
    #
    # Returns the total cost, or 99.9999 (and sets $sd_ht{MISMATCH}) when no
    # alignment reaches the end of both charts.
    # $control containing "verbose" (case-insensitive) traces on STDERR.

    my $verbose = ($control =~ /verbose/i);
    my $chart_comb_id = join("/", $chart1_id, $chart2_id);

    my $chart1_end_pos = $sd_ht{MAXLINPOS}->{$chart1_id};
    my $chart2_end_pos = $sd_ht{MAXLINPOS}->{$chart2_id};
    print STDERR "string_distance.$chart_comb_id $chart1_end_pos/$chart2_end_pos\n" if $verbose;
    $sd_ht{COST_IJ}->{$chart_comb_id}->{0}->{0} = 0;
    $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{0}->{0} = "";
    $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{0}->{0} = "";
    foreach my $chart1_start ((0 .. $chart1_end_pos)) {
        my $prev_further_expansion_possible = 0;
        my @chart1_ends = sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart1_id}->{$chart1_start}};
        my $max_chart1_ends = (@chart1_ends) ? $chart1_ends[$#chart1_ends] : -1;
        foreach my $chart1_end (($chart1_start .. $chart1_end_pos)) {
            # Decides whether longer spans from $chart1_start are worth trying
            # after this one.
            my $further_expansion_possible = ($chart1_start == $chart1_end)
                || defined($sd_ht{LINPOS2SPLITPOS}->{$chart1_id}->{$chart1_start})
                || ($chart1_end < $max_chart1_ends);
            # An empty span stands for "" (insertion on side 2).
            my @romans1 = (($chart1_start == $chart1_end)
                           ? ("")
                           : (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart1_id}->{$chart1_start}->{$chart1_end}}));
            if ($#romans1 == -1) {
                $further_expansion_possible = 1 if $prev_further_expansion_possible;
            } else {
                $prev_further_expansion_possible = 0;
            }
            foreach my $roman1 (@romans1) {
                next unless $ht{RULE_STRING}->{$lang_code1}->{$roman1}
                         || $ht{RULE_STRING}->{""}->{$roman1};
                foreach my $lang_code1o (($lang_code1, "")) {
                    foreach my $lang_code2o (($lang_code2, "")) {
                        # Chart-2 positions already reachable together with
                        # $chart1_start. The list is extended below while it
                        # is being iterated, so newly reached cells are
                        # processed in the same sweep.
                        my @chart2_starts = (sort { $a <=> $b } keys %{$sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}});
                        foreach my $chart2_start (@chart2_starts) {
                            foreach my $chart2_end (($chart2_start .. $chart2_end_pos)) {
                                print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end\n" if $verbose;
                                my @romans2 = (($chart2_start == $chart2_end)
                                               ? ("")
                                               : (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart2_id}->{$chart2_start}->{$chart2_end}}));
                                foreach my $roman2 (@romans2) {
                                    if ($roman1 eq $roman2) {
                                        print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end $roman2 (IDENTITY)\n" if $verbose;
                                        my $cost = 0;
                                        my $preceding_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                                        my $combined_cost = $preceding_cost + $cost;
                                        my $old_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
                                        if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                                            $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $combined_cost;
                                            push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
                                            $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart1_start;
                                            $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart2_start;
                                            $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman1;
                                            $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman2;
                                            $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                                = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman1;
                                            $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                                = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman2;
                                            $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost;
                                            $sd_ht{COST_RULE}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = "IDENTITY";
                                            print STDERR " New cost $chart1_end/$chart2_end: $combined_cost (+$cost from $chart1_start/$chart2_start $roman1/$roman2)\n" if $verbose;
                                        }
                                    } else {
                                        next unless $ht{RULE_STRING}->{$lang_code2o}->{$roman2};
                                        print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end $roman2\n" if $verbose;
                                        next unless defined($ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2});
                                        my @cost_rule_ids = keys %{$ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2}};
                                        foreach my $cost_rule_id (@cost_rule_ids) {
                                            ## check whether any context requirements are satisfied
                                            # left context rules are regular expressions, matched
                                            # against everything aligned so far on that side
                                            my $left_context_rule1 = $ht{LEFT1}->{$cost_rule_id};
                                            if ($left_context_rule1) {
                                                my $comb_left_roman1 = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                                                if (defined($comb_left_roman1)) {
                                                    next unless $comb_left_roman1 =~ /$left_context_rule1/;
                                                } else {
                                                    print STDERR " No comb_left_roman1 value for $chart_comb_id $chart1_start,$chart2_start\n";
                                                }
                                            }
                                            my $left_context_rule2 = $ht{LEFT2}->{$cost_rule_id};
                                            if ($left_context_rule2) {
                                                my $comb_left_roman2 = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                                                if (defined($comb_left_roman2)) {
                                                    next unless $comb_left_roman2 =~ /$left_context_rule2/;
                                                } else {
                                                    print STDERR " No comb_left_roman2 value for $chart_comb_id $chart1_start,$chart2_start\n";
                                                }
                                            }
                                            # right context rules are bracketed item sequences,
                                            # checked against the chart after the matched span
                                            my $right_context_rule1 = $ht{RIGHT1}->{$cost_rule_id};
                                            if ($right_context_rule1) {
                                                my $match_p = $this->right_context_match($right_context_rule1, *sd_ht, $chart1_id, $chart1_end);
                                                next unless $match_p;
                                            }
                                            my $right_context_rule2 = $ht{RIGHT2}->{$cost_rule_id};
                                            if ($right_context_rule2) {
                                                my $match_p = $this->right_context_match($right_context_rule2, *sd_ht, $chart2_id, $chart2_end);
                                                next unless $match_p;
                                            }
                                            my $cost = $ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2}->{$cost_rule_id};
                                            my $preceding_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                                            my $combined_cost = $preceding_cost + $cost;
                                            my $old_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
                                            if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                                                $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $combined_cost;
                                                push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
                                                $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart1_start;
                                                $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart2_start;
                                                $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman1;
                                                $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman2;
                                                $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                                    = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman1;
                                                $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                                    = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman2;
                                                $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost;
                                                $sd_ht{COST_RULE}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost_rule_id;
                                                print STDERR " New cost $chart1_end/$chart2_end: $combined_cost (+$cost from $chart1_start/$chart2_start $roman1/$roman2)\n" if $verbose;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                # A rule string that is a prefix of a longer rule string keeps
                # longer chart-1 spans worth exploring.
                $further_expansion_possible = 1
                    if $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code1}->{$roman1}
                    || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$roman1};
            }
            last unless $further_expansion_possible;
            $prev_further_expansion_possible = 1 if $further_expansion_possible;
        }
    }
    my $total_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end_pos}->{$chart2_end_pos};
    unless (defined($total_cost)) {
        # No alignment covers both strings completely.
        $total_cost = 99.9999;
        $sd_ht{MISMATCH}->{$chart_comb_id} = 1;
    }
    return $total_cost;
}
|
553 |
+
|
554 |
+
# Debugging aid: dump the linearized string-distance charts of two chart ids.
#
# Arguments:
#   $this       - invocant (unused here)
#   *sd_ht      - typeglob of the string-distance hash; only its
#                 LIN_IJ_ROMAN->{chart id}->{from}->{to}->{roman} entries are read
#   $chart1_id,
#   $chart2_id  - the two chart ids to dump, in that order
#   *OUT        - typeglob of the filehandle to print to
#
# Positions are listed in ascending numeric order and the romanizations of
# each span in string order.
sub print_sd_ht {
    # local (not my) is required: typeglob arguments can only alias package variables.
    local($this, *sd_ht, $chart1_id, $chart2_id, *OUT) = @_;

    print OUT "string-distance chart:\n";
    for my $id ($chart1_id, $chart2_id) {
        print OUT "SD chart $id:\n";
        my @from_positions = sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$id}};
        for my $from (@from_positions) {
            my @to_positions = sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$id}->{$from}};
            for my $to (@to_positions) {
                # One output line per alternative romanization of the span.
                print OUT " Lnode($from-$to): $_\n"
                    for sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$id}->{$from}->{$to}};
            }
        }
    }
}
|
569 |
+
|
570 |
+
# Debugging aid: dump every node of a uroman chart with its span and romanization.
#
# Arguments:
#   $this      - invocant (unused here)
#   *chart_ht  - typeglob of the chart hash; reads NODES_STARTING_AT (start
#                positions), NODES_STARTING_AND_ENDING_AT->{start}->{end}->{node id}
#                and NODE_ROMAN->{node id}
#   *OUT       - typeglob of the filehandle to print to
#
# Start and end positions are listed in ascending numeric order. Node ids that
# share a span are listed in string order so that the dump is reproducible
# from run to run (plain hash order is not).
sub print_chart_ht {
    # local (not my) is required: typeglob arguments can only alias package variables.
    local($this, *chart_ht, *OUT) = @_;

    print OUT "uroman chart:\n";
    foreach my $start (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AT}}) {
        foreach my $end (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}}) {
            foreach my $node_id (sort keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}) {
                # Lexical, so the value no longer leaks into a package global.
                my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
                # A node without a romanization prints as empty text either way;
                # the guard only avoids an "uninitialized value" warning.
                $roman_s = "" unless defined($roman_s);
                print OUT " Node $node_id ($start-$end): $roman_s\n";
            }
        }
    }
}
|
583 |
+
|
584 |
+
# Lightly normalize a UTF-8 encoded byte string before it is romanized.
#
# Arguments:
#   $this - invocant (unused here)
#   $s    - the string to normalize; the patterns below match raw UTF-8 bytes
#
# Returns the normalized copy:
#   * en-dash and em-dash become an ASCII hyphen,
#   * any run of the same character is cut down to two occurrences,
#   * any run of spaces and tabs becomes a single space.
sub normalize_string {
    my ($this, $s) = @_;

    for ($s) {
        # Zero width non-joiner (\xE2\x80\x8C) is deliberately left in place.
        s/(\xE2\x80[\x93-\x94])/-/g; # en-dash, em-dash
        # A "character" is one lead/ASCII byte plus its continuation bytes.
        s/([\x00-\x7F\xC0-\xFE][\x80-\xBF]*)\1+/$1$1/g; # shorten 3 or more occurrences of same character in a row to 2
        s/[ \t]+/ /g;
    }

    return $s;
}
|
594 |
+
|
595 |
+
my $string_distance_chart_id = 0;
|
596 |
+
# Compute the string distance between two raw (not yet romanized) strings.
#
# Each string is normalized, lower-cased, romanized into a chart under a
# fresh chart id, linearized into the shared %sd_ht and expanded; the two
# charts are then compared with string_distance.
#
# Arguments:
#   $this         - invocant
#   $s1, $s2      - the two strings to compare
#   $lang_code1,
#   $lang_code2   - language codes handed to the romanizer and the cost rules
#   *ht           - typeglob of the rule/cost hash
#   *pinyin_ht    - typeglob of the pinyin hash handed to the romanizer
#   $control      - optional control string, defaults to ""
#
# Returns whatever string_distance returns for the two charts.
# Side effects: resets the package hash %sd_ht, reassigns the package glob
# *chart_ht and advances the file-level chart id counter by two.
sub string_distance_by_chart {
    # local (not my) is required: typeglob arguments can only alias package variables.
    local($this, $s1, $s2, $lang_code1, $lang_code2, *ht, *pinyin_ht, $control) = @_;

    $control = "" if ! defined($control);
    %sd_ht = ();

    # First string: build chart 1 and fold it into %sd_ht.
    $s1 = $this->normalize_string($s1);
    my $lowered1 = $utf8->extended_lower_case($s1);
    my $chart1_id = ++$string_distance_chart_id;
    *chart_ht = $romanizer->romanize($lowered1, $lang_code1, "", *ht, *pinyin_ht, 0, "return chart", $chart1_id);
    $this->linearize_chart_points(*chart_ht, $chart1_id, *sd_ht);
    $this->expand_lin_ij_roman(*sd_ht, $chart1_id, $lang_code1, *ht);

    # Second string: *chart_ht is reused, so chart 1 had to be linearized first.
    $s2 = $this->normalize_string($s2);
    my $lowered2 = $utf8->extended_lower_case($s2);
    my $chart2_id = ++$string_distance_chart_id;
    *chart_ht = $romanizer->romanize($lowered2, $lang_code2, "", *ht, *pinyin_ht, 0, "return chart", $chart2_id);
    $this->linearize_chart_points(*chart_ht, $chart2_id, *sd_ht);
    $this->expand_lin_ij_roman(*sd_ht, $chart2_id, $lang_code2, *ht);

    my $distance = $this->string_distance(*sd_ht, $chart1_id, $chart2_id, $lang_code1, $lang_code2, *ht, $control);
    return $distance;
}
|
621 |
+
|
622 |
+
my $n_quick_romanized_string_distance = 0;
|
623 |
+
# Compute the string distance between two strings that are already romanized,
# working directly on character offsets instead of building uroman charts.
#
# Arguments:
#   $this         - invocant (unused here)
#   $s1, $s2      - the two romanized strings; both are lower-cased with lc
#   *ht           - typeglob of the rule hash; reads COST, LEFT1, LEFT2, RIGHT1,
#                   RIGHT2 and, when caching, reads and writes CACHED_QRSD
#   $control      - optional control string; if it contains "cache" the result
#                   is memoized in $ht{CACHED_QRSD}->{$s1}->{$s2}
#   $lang_code1,
#   $lang_code2   - optional language codes; rules for the given code and for
#                   the empty (language-independent) code are both tried
#
# Returns the cheapest total cost of rewriting $s1 into $s2, or 99.99 if the
# two strings cannot be aligned with the available rules.
#
# $sd_ht{COST_IJ}->{i}->{j} holds the cheapest known cost of aligning the first
# i characters of $s1 with the first j characters of $s2. A cell is extended
# either by an identical substring pair (cost 0) or by a cost rule
# $ht{COST}->{lc1}->{lc2}->{substr1}->{substr2}->{rule id} whose optional
# left/right context patterns match.
#
# NOTE(review): the cache key ignores the language codes, so cached results
# are shared between calls that differ only in language - confirm that this
# is intended.
sub quick_romanized_string_distance_by_chart {
    # local (not my) is required: typeglob arguments can only alias package variables.
    local($this, $s1, $s2, *ht, $control, $lang_code1, $lang_code2) = @_;

    $s1 = lc $s1;
    $s2 = lc $s2;
    $control = "" unless defined($control);
    $lang_code1 = "" unless defined($lang_code1);
    $lang_code2 = "" unless defined($lang_code2);

    my $cache_p = ($control =~ /cache/);
    my $total_cost;
    if ($cache_p) {
        $total_cost = $ht{CACHED_QRSD}->{$s1}->{$s2};
        return $total_cost if defined($total_cost);
    }

    my @lang_codes1 = ($lang_code1 eq "") ? ("") : ($lang_code1, "");
    my @lang_codes2 = ($lang_code2 eq "") ? ("") : ($lang_code2, "");
    my $chart1_end_pos = length($s1);
    my $chart2_end_pos = length($s2);
    my %sd_ht = ();
    $sd_ht{COST_IJ}->{0}->{0} = 0;

    foreach my $chart1_start (0 .. $chart1_end_pos) {
        foreach my $chart1_end ($chart1_start .. $chart1_end_pos) {
            my $substr1 = substr($s1, $chart1_start, ($chart1_end-$chart1_start));
            my @chart2_starts = (sort { $a <=> $b } keys %{$sd_ht{COST_IJ}->{$chart1_start}});
            # Index loop on purpose: the list grows while it is being walked,
            # which foreach does not support reliably.
            for (my $i = 0; $i <= $#chart2_starts; $i++) {
                my $chart2_start = $chart2_starts[$i];
                foreach my $chart2_end ($chart2_start .. $chart2_end_pos) {
                    my $substr2 = substr($s2, $chart2_start, ($chart2_end-$chart2_start));

                    if ($substr1 eq $substr2) {
                        # Identical substrings align for free, whatever the language.
                        my $cost = 0;
                        my $preceding_cost = $sd_ht{COST_IJ}->{$chart1_start}->{$chart2_start};
                        if (defined($preceding_cost)) {
                            my $combined_cost = $preceding_cost + $cost;
                            my $old_cost = $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end};
                            if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                                $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end} = $combined_cost;
                                push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
                            }
                        }
                        next;
                    }

                    foreach my $lang_code1o (@lang_codes1) {
                        foreach my $lang_code2o (@lang_codes2) {
                            next unless defined($ht{COST}->{$lang_code1o}->{$lang_code2o}->{$substr1}->{$substr2});
                            my @cost_rule_ids = keys %{$ht{COST}->{$lang_code1o}->{$lang_code2o}->{$substr1}->{$substr2}};
                            foreach my $cost_rule_id (@cost_rule_ids) {
                                my $cost = $ht{COST}->{$lang_code1o}->{$lang_code2o}->{$substr1}->{$substr2}->{$cost_rule_id};

                                # Left contexts are matched against everything before the span.
                                my $left_context_rule1 = $ht{LEFT1}->{$cost_rule_id};
                                next if $left_context_rule1
                                     && (! (substr($s1, 0, $chart1_start) =~ /$left_context_rule1/));
                                my $left_context_rule2 = $ht{LEFT2}->{$cost_rule_id};
                                next if $left_context_rule2
                                     && (! (substr($s2, 0, $chart2_start) =~ /$left_context_rule2/));

                                # Right contexts are anchored at the end of the span. A rule
                                # that is a single character class ending in "$" is also
                                # satisfied by an empty right context (end of string).
                                my $right_context_rule1 = $ht{RIGHT1}->{$cost_rule_id};
                                my $right_context1 = substr($s1, $chart1_end);
                                next if $right_context_rule1
                                     && (! (($right_context1 =~ /^$right_context_rule1/)
                                            || (($right_context_rule1 =~ /^\[[^\[\]]*\$/)
                                                && ($right_context1 eq ""))));
                                my $right_context_rule2 = $ht{RIGHT2}->{$cost_rule_id};
                                my $right_context2 = substr($s2, $chart2_end);
                                next if $right_context_rule2
                                     && (! (($right_context2 =~ /^$right_context_rule2/)
                                            || (($right_context_rule2 =~ /^\[[^\[\]]*\$/)
                                                && ($right_context2 eq ""))));

                                my $preceding_cost = $sd_ht{COST_IJ}->{$chart1_start}->{$chart2_start};
                                # A start cell that was never reached must not be extended;
                                # treating its undefined cost as 0 would let part of $s2 be
                                # skipped for free.
                                next unless defined($preceding_cost);
                                my $combined_cost = $preceding_cost + $cost;
                                my $old_cost = $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end};
                                if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                                    $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end} = $combined_cost;
                                    push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    $total_cost = $sd_ht{COST_IJ}->{$chart1_end_pos}->{$chart2_end_pos};
    $total_cost = 99.99 unless defined($total_cost);
    $ht{CACHED_QRSD}->{$s1}->{$s2} = $total_cost if $cache_p;
    $n_quick_romanized_string_distance++;
    return $total_cost;
}
|
718 |
+
|
719 |
+
# Accessor for the file-level counter that quick_romanized_string_distance_by_chart
# increments each time it computes a distance (cache hits return earlier and
# are not counted).
sub get_n_quick_romanized_string_distance { return $n_quick_romanized_string_distance; }
|
722 |
+
|
723 |
+
1;
|
724 |
+
|
uroman/lib/NLP/utilities.pm
ADDED
The diff for this file is too large to render.
See raw diff
|
|
uroman/tarballs/uroman-v1.0.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:912655beef069e5abb43c8fc4c3c4428fd0af6f4a1697accc98277933d3e1ee5
|
3 |
+
size 440252
|
uroman/tarballs/uroman-v1.1.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df990f6096a10e093ac5f28c2b86d5ef9e9098ef7472855843f9a841bb3b963d
|
3 |
+
size 507234
|
uroman/tarballs/uroman-v1.2.4.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:77d707f3c17d5c45869b80fe71caee6023d1d9949ccffb446626f374605a25e2
|
3 |
+
size 503690
|
uroman/tarballs/uroman-v1.2.5.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2e9044afff8b4483f43a99b1fb1279889336760d76245ee93f300e660a46660
|
3 |
+
size 575581
|
uroman/tarballs/uroman-v1.2.6.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02f6f73b067b972a8f7d408da2f9b22741629af67f55b2ea768d11710fbf40a4
|
3 |
+
size 567522
|
uroman/tarballs/uroman-v1.2.7.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fbb51506ed3ea6dcb902c824e62bea39b3741f6526564ba05d6e0083d8d876e5
|
3 |
+
size 566800
|
uroman/tarballs/uroman-v1.2.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c69e56d9c5eea9416ae00ca4dd859a1ef5129c1867778b66ad2f811f0fd33c9
|
3 |
+
size 494625
|
uroman/test/multi-script.txt
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::lcode deu Grüße aus Bordeaux
|
2 |
+
::lcode tur İstanbul, Türkiye'de yer alan şehir ve ülkenin 81 ilinden biri.
|
3 |
+
::lcode eng ⠠⠺⠑⠀⠓⠕⠇⠙⠀⠘⠮⠀⠞⠗⠥⠹⠎⠀⠞⠕⠀⠆⠀⠎⠑⠇⠋⠤⠑⠧⠊⠙⠢⠞⠂⠀⠞⠀⠁⠇⠇⠀⠍⠑⠝⠀⠜⠑⠀⠉⠗⠂⠞⠫⠀⠑⠟⠥⠁⠇⠂⠀⠞⠀⠮⠽⠀⠜⠑⠀⠑⠝⠙⠪⠫⠀⠃⠽⠀⠸⠮⠀⠠⠉⠗⠑⠁⠞⠕⠗⠀⠾⠀⠉⠻⠞⠁⠔⠀⠥⠝⠁⠇⠊⠑⠝⠁⠃⠇⠑⠀⠠⠐⠗⠎⠂⠀⠞⠀⠁⠍⠰⠛⠀⠘⠮⠀⠜⠑⠀⠠⠇⠊⠋⠑⠂⠀⠠⠇⠊⠃⠻⠞⠽⠀⠯⠀⠮⠀⠏⠥⠗⠎⠥⠊⠞⠀⠷⠀⠠⠓⠁⠏⠏⠊⠰⠎⠲
|
4 |
+
::lcode ell Το Λος Άντζελες (στα ισπανικά Los Angeles = Οι Άγγελοι) ή στην Αμερικανική αργκό L.A., ελ έι) είναι η δεύτερη μεγαλύτερη πόλη των Ηνωμένων Πολιτειών από άποψη πληθυσμού, καθώς και ένα από τα σημαντικότερα οικονομικά, πολιτιστικά επιστημονικά και ψυχαγωγικά κέντρα του κόσμου.
|
5 |
+
::lcode rus Герма́ния (нем. Deutschland), официальное название — Федерати́вная Респу́блика Герма́ния (нем. Bundesrepublik Deutschland), ФРГ (нем. BRD) — государство в Западной Европе. Площадь территории — 357 021 км². Численность населения по переписи 2011 года — более 80 миллионов человек. [2][6].
|
6 |
+
::lcode ukr Володи́мир Олекса́ндрович Зеле́нський (нар. 25 січня 1978, Кривий Ріг) — український державний діяч, політик, шоумен, актор, комік, режисер, продюсер та сценарист, шостий Президент України з 20 травня 2019 року.
|
7 |
+
::lcode srp Сва људска бића рађају се слободна и једнака у достојанству и правима. Она су обдарена разумом и свешћу и треба једни према другима да поступају у духу братства.
|
8 |
+
::lcode ara كندا (بالإنجليزية: Canada) هي دولة في أمريكا الشمالية تتألف من 10 مقاطعات وثلاثة أقاليم. تقع في القسم الشمالي من القارة وتمتد من المحيط الأطلسي في الشرق إلى المحيط الهادئ في الغرب وتمتد شمالاً في المحيط المتجمد الشمالي. كندا هي البلد الثاني عالمياً من حيث المساحة الكلية. كما أن حدود كندا المشتركة مع الولايات المتحدة من الجنوب والشمال الغربي هي الأطول في العالم.
|
9 |
+
::lcode fas کالیفرنیا (به انگلیسی: California) ایالتی در غرب آمریکا بر کرانهٔ اقیانوس آرام است. مرکز آن ساکرامنتو و شهرهای مهم آن لسآنجلس، سن دیگو، سن خوزه و سانفرانسیسکو هستند.همچنین این ایالت پر جمعیت ترین ایالت امریکا است.
|
10 |
+
::lcode uig ئامېرىكا قوشما شتاتلىرى بولسا شىمالىي ئامېرىكاغا جايلاشقان بىر دۆلەت. ئۇنىڭ پايتەختى بولسا ۋاشىنگتون، ئەڭ چوڭ شەھىرى بولسا نيۇيورك شەھىرى. دۆلەت تىلى بولسا ئېنگلىزتىلى. ھازىرقى زۇڭتۇڭ باراك ئوباما. بۇ دۆلەت ئەسلىدە ئەنگىلىيەنىڭ مۇستەملىكىسى بولۇپ ۋاشىنگىتوننىڭ رەھپەرلىكىدە 1776 يىلى 7 ئاينىڭ 4 كۇنى مۇستەقىل بولغان، يەر مەيدانى 9 مىلىيون 826 مىڭ 630 كۋادىرات كلومېتىر، نوپۇسى 306 مىللىيون 142 مىڭ، بۇلارنىڭ ئاسساسلىق دىنى خرىستىئان دىنى.
|
11 |
+
::lcode amh ኢትዮጵያ ከዓለም ሶስቱ ትልቅ የአብርሃም ሀይማኖቶች ጋር ታሪካዊ ግንኙነት አላት።
|
12 |
+
::lcode hin कैलिफ़ोर्निया शब्द का पहला अर्थ था जो क्षेत्र जहाँ आज बाहा कैलिफ़ोर्निया प्रायद्वीप, नेवाडा, यूटा और एरिज़ोना, नया मेक्सिको, और वायोमिंग के कई विभाग स्थित हैं।
|
13 |
+
::lcode mar लंडन (इंग्लिश: London ) हे इंग्लंडचे व युनायटेड किंग्डमचे राजधानीचे व सर्वात मोठे शहर तसेच युरोपियन संघामधील सर्वात मोठे महानगर क्षेत्र आहे.
|
14 |
+
::lcode nep यसको उचाइ समुन्द्र सतहबाट ८,८४८ मीटर (२९,०२८ फीट) छ। यो नेपालको सोलुखुम्बु जिल्लाको खुम्जुङ्ग गा. वि. स. मा पर्छ ।
|
15 |
+
::lcode tam தமிழ்நாடு (Tamil Nadu) இந்தியாவின் 29 மாநிலங்களில் ஒன்றாகும். தமிழ்நாடு, தமிழகம் என்றும் பரவலாக அழைக்கப்படுகிறது.
|
16 |
+
::lcode mal ഇന്ത്യയുടെ തെക്കുപടിഞ്ഞാറെ അറ്റത്തുള്ള സംസ്ഥാനമാണ് കേരളം.
|
17 |
+
::lcode ori ଓଡ଼ିଶା ଭାରତର ପୂର୍ବ ଉପକୂଳରେ ଥିବା ଏକ ପ୍ରଶାସନିକ ରାଜ୍ୟ । ଏହାର ଉତ୍ତର-ପୂର୍ବରେ ପଶ୍ଚିମବଙ୍ଗ, ଉତ୍ତରରେ ଝାଡ଼ଖଣ୍ଡ, ପଶ୍ଚିମ ଓ ଉତ୍ତର-ପଶ୍ଚିମରେ ଛତିଶଗଡ଼, ଦକ୍ଷିଣ ଓ ଦକ୍ଷିଣ-ପଶ୍ଚିମରେ ଆନ୍ଧ୍ରପ୍ରଦେଶ ଅବସ୍ଥିତ । ଏହା ଆୟତନ ହିସାବରେ ନବମ ଓ ଜନସଂଖ୍ୟା ହିସାବରେ ଏଗାରତମ ରାଜ୍ୟ । ଓଡ଼ିଆ ଭାଷା ରାଜ୍ୟର ସରକାରୀ ଭାଷା । ୨୦୦୧ ଜନଗଣନା ଅନୁସାରେ ରାଜ୍ୟର ପ୍ରାୟ ୩୩.୨ ନିୟୁତ ଲୋକ ଓଡ଼ିଆ ଭାଷା ବ୍ୟବହାର କରନ୍ତି ।
|
18 |
+
::lcode zho 加拿大在一万四千年前即有原住民在此生活。
|
19 |
+
::lcode heb כֹּל עוֹד בַּלֵּבָב פְּנִימָה נֶפֶשׁ יְהוּדִי הוֹמִיָּה וּלְפַאֲתֵי מִזְרָח, קָדִימָה, עַיִן לְצִיּוֹן צוֹפִיָּה, עוֹד לֹא אָבְדָה תִּקְוָתֵנוּ, הַתִּקְוָה בַּת שְׁנוֹת אַלְפַּיִם לִהְיוֹת עַם חָפְשִׁי בְּאַרְצֵנוּ, אֶרֶץ צִיּוֹן וִירוּשָׁלַיִם.
|
20 |
+
::lcode yid דווקא איז אן העברעישער זשורנאל וואס באשרייבט די יידיש־שפראכיקע קולטור. עס איז דערשינען געווארן תמוז ה'תשס"ז (יולי 2006).
|
21 |
+
::lcode hye Տալնոեի շրջան (ուկր.՝ Тальнівський район), շրջան Ուկրաինայի Չերկասիի մարզում։ Ստեղծվել է 1923 թվականին։ Վարչական կենտրոնը՝ Տալնոե։ Աշխարհագրությունը Շրջանի տարածքի մակերեսը կազմում է 917 կմ²։ Բնակչություն
|
22 |
+
::lcode tai มีประเทศอิสระ 2 ประเทศ คือ ซานมารีโนและนครรัฐวาติกัน เป็นดินแดนที่ล้อมรอบไปด้วยพื้นที่ของอิตาลี ในขณะที่เมืองกัมปีโอเนดีตาเลีย เป็นดินแดนส่วนแยกของอิตาลีที่ถูกล้อมรอบด้วยพื้นที่ประเทศสวิตเซอร์แลนด์
|
23 |
+
북쪽에는 인도네시아와 동티모르, 파푸아 뉴기니, 북동쪽에는 솔로몬 제도와 바누아투, 누벨칼레도니, 그리고 남동쪽에는 뉴질랜드가 있다.
|
24 |
+
ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು ಇಂದೆನ್ನ ಹೃದಯದಲಿ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗೀ... ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗಿ ಭವ ಭವದಿ ಭತಿಸಿಹೇ ಭವತಿ ದೂರ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ || ಬಾ ಇಲ್ಲಿ ||
|
25 |
+
ვეპხის ტყაოსანი შოთა რუსთაველი ღმერთსი შემვედრე, ნუთუ კვლა დამხსნას სოფლისა შრომასა, ცეცხლს, წყალსა და მიწასა, ჰაერთა თანა მრომასა; მომცნეს ფრთენი და აღვფრინდე, მივჰხვდე მას ჩემსა ნდომასა, დღისით და ღამით ვჰხედვიდე მზისა ელვათა კრთომაასა.
|
26 |
+
᚛ᚐᚅᚋ ᚋᚖᚂᚓᚌᚖᚋᚏᚔᚇ ᚋᚐᚉᚔ ᚍᚓᚉᚒᚋᚓᚅ᚜
|
27 |
+
ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬
|
28 |
+
𓊪𓏏𓍯𓃭𓐝𓇌𓋴
|
29 |
+
チェコスロバキア
|
30 |
+
ལྷ་ས་གྲོང་ཁྱེར
|
31 |
+
ᓵᓕ ᓴᕕᐊᕐᔪᒃ ᐃᒻᒥᓂᒃ ᓂᓪᓕᕈᑎᖃᓲᖑᕗᖅ ᑕᐃᑦᓱᒪᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ. ᐃᒻᒥᓂᓪᓗᑕᐅᖅ ᓂᓪᓕᕈᑎᖃᓱᖑᒻᒥᓱᓂ ᐅᓪᓗᒥᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ.
|
32 |
+
ⴰⵎⴰⴳⵔⴰⴷ 1 ⴰⵔ ⴷ ⵜⵜⵍⴰⵍⴰⵏ ⵎⵉⴷⴷⵏ ⴳⴰⵏ ⵉⵍⴻⵍⵍⵉⵜⵏ ⵎⴳⴰⴷⴷⴰⵏ ⵖ ⵡⴰⴷⴷⵓⵔ ⴷ ⵉⵣⵔⴼⴰⵏ, ⵢⵉⵍⵉ ⴰⴽⵯ ⴷⴰⵔⵙⵏ ⵓⵏⵍⵍⵉ ⴷ ⵓⴼⵔⴰⴽ, ⵉⵍⵍⴰ ⴼⵍⵍⴰ ⵙⵏ ⴰⴷ ⵜⵜⵎⵢⴰⵡⴰⵙⵏ ⵏⴳⵔⴰⵜⵙⵏ ⵙ ⵜⴰⴳⵎⴰⵜ.
|
uroman/test/multi-script.uroman-ref.txt
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::lcode deu Gruesse aus Bordeaux
|
2 |
+
::lcode tur Istanbul, Tuerkiye'de yer alan shehir ve uelkenin 81 ilinden biri.
|
3 |
+
::lcode eng We hold ⠘e truos to ; self-evid⠢t, t all men aee cr,te equal, t ey aee endoee by ⠸e Creator u cita⠔ unalienable ⠠⠐rs, t amg ⠘e aee Life, Libity ⠯ e pursuit a Happis.
|
4 |
+
::lcode ell To Los Andzeles (sta ispanika Los Angeles = Oi Angeloi) e sten Amerikanike arngo L.A., el ei) einai e deutere megalutere pole ton Enomenon Politeion apo apopse plethysmou, kathos kai ena apo ta semandikotera oikonomika, politistika epistemonika kai psychagogika kendra tou kosmou.
|
5 |
+
::lcode rus Germaniya (nem. Deutschland), ofitsialnoe nazvanie — Federativnaya Respublika Germaniya (nem. Bundesrepublik Deutschland), FRG (nem. BRD) — gosudarstvo v Zapadnoi Evrope. Ploshchad territorii — 357 021 km². Chislennost naseleniya po perepisi 2011 goda — bolee 80 millionov chelovek. [2][6].
|
6 |
+
::lcode ukr Volodimir Oleksandrovich Zelensky (nar. 25 sichnya 1978, Krivy Rig) — ukrayinsky derzhavny diyach, politik, shoumen, aktor, komik, rezhiser, prodyuser ta stsenarist, shosty Prezident Ukrayini z 20 travnya 2019 roku.
|
7 |
+
::lcode srp Sva ljudska bitsha radjaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sveshtshu i treba jedni prema drugima da postupaju u dukhu bratstva.
|
8 |
+
::lcode ara knda (balinjlyzya: Canada) hy dwla fy amryka alshmalya ttalf mn 10 mqat'at wthlatha aqalym. tq' fy alqsm alshmaly mn alqara wtmtd mn almhyt alatlsy fy alshrq ila almhyt alhadye fy alghrb wtmtd shmalan fy almhyt almtjmd alshmaly. knda hy albld althany 'almyan mn hyth almsaha alklya. kma an hdwd knda almshtrka m' alwlayat almthda mn aljnwb walshmal alghrby hy alatwl fy al'alm.
|
9 |
+
::lcode fas kalifrnia (bh anglisi: California) ialti dr ghrb amrika br kranh' aqianws aram ast. mrkz an sakramntw w shhrhai mhm an lsanjls, sn digw, sn khwzh w sanfransiskw hstnd.hmtchnin in ialt pr jm'it trin ialt amrika ast.
|
10 |
+
::lcode uig yeameraka qwshma shtatlara bwlsa shamalay yeamerakagha jaylashqan bar doelaet. yeunang paytaekhta bwlsa vashangtwn, yeaeng tchwng shaehara bwlsa nyuywrk shaehara. doelaet tala bwlsa yeenglaztala. hazarqa zungtung barak yewbama. bu doelaet yeaesladae yeaengalayaenang mustaemlakasa bwlup vashangatwnnang raehpaerlakadae 1776 yala 7 yeaynang 4 kuna mustaeqal bwlghan, yaer maeydana 9 malaywn 826 mang 630 kvadarat klwmetar, nwpusa 306 mallaywn 142 mang, bularnang yeassaslaq dana khrastayean dana.
|
11 |
+
::lcode amh iteyopheyaa kaaalame sosetu teleqe yaaberehaame hayemaanotoche gaare taarikaawi genenyunate alaate.
|
12 |
+
::lcode hin kailiphorniyaa shabda kaa pahalaa artha thaa jo kssetra jahaam aaj baahaa kailiphorniyaa praayadviip, nevaaddaa, yuuttaa aur erijonaa, nayaa meksiko, aur vaayomimga ke kaii vibhaag sthit haim.
|
13 |
+
::lcode mar lamddan (imglish: London ) he imglamddace va yunaayattedd kimgddamace raajadhaaniice va sarvaat motthe shahar tasec yuropiyan samghaamadhiil sarvaat motthe mahaanagar kssetra aahe.
|
14 |
+
::lcode nep yasako ucaai samundra satahabaatt 8,848 miittar (29,028 phiitt) cha. yo nepaalako solukhumbu jillaako khumjungga gaa. vi. sa. maa parcha .
|
15 |
+
::lcode tam tamilnaadu (Tamil Nadu) intiyaavin 29 maanilangkalil onraakum. tamilnaadu, tamilakam enrum paravalaaka alaikkappadukiratu.
|
16 |
+
::lcode mal intyayutte tekkupattinynyaarre arrrrattulllla samsthaanamaann keerallam.
|
17 |
+
::lcode ori oddishaa bhaaratara puurba upakuullare thibaa eka prashaasanika raajya . ehaara uttara-puurbare pashcimabangga, uttarare jhaaddakhanndda, pashcima o uttara-pashcimare chatishagadda, dakssinna o dakssinna-pashcimare aandhrapradesha abasthita . ehaa aayatana hisaabare nabama o janasamkhyaa hisaabare egaaratama raajya . oddiaa bhaassaa raajyara sarakaarii bhaassaa . 2001 janagannanaa anusaare raajyara praaya 33.2 niyuta loka oddiaa bhaassaa byabahaara karanti .
|
18 |
+
::lcode zho jianadazai14000nianqianjiyouyuanzhuminzaicishenghuo.
|
19 |
+
::lcode heb kol 'od balevav penimah nefesh yehudi homiyah ulefa'ate mizerach, qadimah, 'ayin letsiyon tsofiyah, 'od lo avedah tiqvatenu, hatiqvah bat shenot 'alepayim liheyot 'am chafeshiy be'aretsenu, erets tsiyon virushalayim.
|
20 |
+
::lcode yid dvvqa ayz an h'vr'ysh'r zshvrnal vvas vashryyvt dy yydysh-shfrakyq' qvltvr. 's ayz d'rshyn'n g'vvarn tmvz h'tshs"z (yvly 2006).
|
21 |
+
::lcode hye Talnoei shrjan (ukr., Talnivsky raion), shrjan Ukrainayi Cherkasii marzum. Steghtsvel e 1923 tvakanin. Varchakan kentrone, Talnoe. Ashkharhagrutyune Shrjani taratski makerese kazmum e 917 km². Bnakchutyun
|
22 |
+
::lcode tai miipratesisra 2 prates kuee saanmaariinolaeankrratwaatikan peondindaentiilomrobpaidwypueentiikongitaalii naiknatiimeueengkampiionediitaaleiiy peondindaenswnyaekkongitaaliitiituuklomrobdwypueentiipratesswitserlaend
|
23 |
+
bugjjogeneun indonesiawa dongtimoreu, papua nyugini, bugdongjjogeneun solromon jedowa banuatu, nubelkalredoni, geurigo namdongjjogeneun nyujilraendeuga issda.
|
24 |
+
baa illi sambhavisu imdenna hrdayadali nityavuu avataripa satyaavataara mannnnaagi maravaagi migavaagi kagavaagii... mannnnaagi maravaagi migavaagi kagavaagi bhava bhavadi bhatisihee bhavati duura nityavuu avataripa satyaavataara || baa illi ||
|
25 |
+
vepxis tqaosani shota rustaveli ghmertsi shemvedre, nutu kvla damxsnas sophlisa shromasa, tsetsxls, tsqalsa da mitsasa, haerta tana mromasa; momtsnes phrteni da aghvphrinde, mivhxvde mas chemsa ndomasa, dghisit da ghamit vhxedvide mzisa elvata krtomaasa.
|
26 |
+
anm moilegoimrid maki vekumen
|
27 |
+
ic mag glas eotan ond hit ne hearmiath me.
|
28 |
+
ptolmys
|
29 |
+
chekosurobakia
|
30 |
+
lha·sa·grong·khyer
|
31 |
+
saali safiaryok imminik nillirotiqasoongofoq taitsomanitatsayaonirarsoni. imminillotaoq nillirotiqasongommisoni ollominitatsayaonirarsoni.
|
32 |
+
amagrad 1 ar d ttlalan middn gan ilellitn mgaddan gh waddur d izrfan, yili ak darsn unlli d ufrak, illa flla sn ad ttmyawasn ngratsn s tagmat.
|
uroman/test/string-similarity-test-input.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
trap strap
|
2 |
+
colour color
|
3 |
+
labeling labelling
|
4 |
+
organisation organization
|
5 |
+
Philadelphia Filadelfia
|
6 |
+
Vladimir Volodymyr
|
7 |
+
Moskva Moskvoy
|
uroman/test/string-similarity-test-output-ref.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Lang-code-1: eng Lang-code-2: eng
|
2 |
+
trap strap 1
|
3 |
+
colour color 0.1
|
4 |
+
labeling labelling 0.02
|
5 |
+
organisation organization 0.1
|
6 |
+
Philadelphia Filadelfia 0.02
|
7 |
+
Vladimir Volodymyr 0.5
|
8 |
+
Moskva Moskvoy 0.5
|
uroman/text/amh.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ኢትዮጵያ ከዓለም ሶስቱ ትልቅ የአብርሃም ሀይማኖቶች ጋር ታሪካዊ ግንኙነት አላት።
|
2 |
+
ክርስትናን በአራተኛው ምዕተ-ዓመት ተቀብላለች።
|
3 |
+
ከሕዝቡ አንድ ሶስተኛው እስላም ነው።
|
4 |
+
የመጀመሪያው የእስላም ሂጅራ ወደ ኢትዮጵያ ነው የተከናወነው።
|
5 |
+
ነጋሽ በአፍሪካ የመጀመሪያው የእስላም መቀመጫ ናት።
|
6 |
+
እስከ ፲፱፻፸ ዎቹ ድረስ ብዙ ቤተ-እስራኤሎች በኢትዮጵያ ይኖሩ ነበር።
|
7 |
+
የራስ ተፈሪ እንቅስቃሴ ኢትዮጵያን በትልቅ ክብር ነው የሚያያት።
|
uroman/text/ara.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
كندا (بالإنجليزية: Canada) هي دولة في أمريكا الشمالية تتألف من 10 مقاطعات وثلاثة أقاليم. تقع في القسم الشمالي من القارة وتمتد من المحيط الأطلسي في الشرق إلى المحيط الهادئ في الغرب وتمتد شمالاً في المحيط المتجمد الشمالي. كندا هي البلد الثاني عالمياً من حيث المساحة الكلية. كما أن حدود كندا المشتركة مع الولايات المتحدة من الجنوب والشمال الغربي هي الأطول في العالم.
|
2 |
+
أراضي كندا مأهولة منذ آلاف السنين من قبل مجموعات مختلفة من السكان الأصليين. مع حلول أواخر القرن الخامس عشر بدأت الحملات البريطانية والفرنسية استكشاف المنطقة ومن ثم استوطنتها على طول ساحل المحيط الأطلسي. تنازلت فرنسا عن ما يقرب من جميع مستعمراتها في أمريكا الشمالية في عام 1763 بعد حرب السنوات السبع. في عام 1867، مع اتحاد ثلاثة مستعمرات بريطانية في أمريكا الشمالية عبر كونفدرالية تشكلت كندا باعتبارها كيانًا فدراليًا ذا سيادة يضم أربع مقاطعات. بدأ ذلك عملية اتسعت فيها مساحة كندا وتوسع حكمها الذاتي عن المملكة المتحدة. تجلت هذه الاستقلالية من خلال تشريع وستمنستر عام 1931 وبلغت ذروتها في صورة قانون كندا عام 1982 والذي قطع الاعتماد القانوني لكندا على البرلمان البريطاني.
|
3 |
+
كندا دولة فيدرالية يحكمها نظام ديمقراطي تمثيلي وملكية دستورية حيث الملكة إليزابيث الثانية قائدة للدولة. الأمة الكندية أمة ثنائية اللغة حيث الإنكليزية والفرنسية لغتان رسميتان على المستوى الاتحادي. تعد كندا واحدة من أكثر دول العالم تطوراً، حيث تمتلك اقتصاداً متنوعاً وتعتمد على مواردها الطبيعية الوفيرة، وعلى التجارة وبخاصة مع الولايات المتحدة اللتان تربطهما علاقة طويلة ومعقدة. كندا عضو في مجموعة الدول الصناعية السبع ومجموعة الثماني ومجموعة العشرين وحلف شمال الأطلسي ومنظمة التعاون والتنمية الاقتصادية ومنظمة التجارة العالمية ودول الكومنولث والفرنكوفونية ومنظمة الدول الأمريكية والإبيك والأمم المتحدة. تمتلك كندا واحداً من أعلى مستويات المعيشة في العالم حيث مؤشر التنمية البشرية يضعها في المرتبة الثامنة عالمياً.
|
uroman/text/ben.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
বার্লিন (জার্মান: Berlin বেয়ালিন্') জার্মানির রাজধানী, এবং ইউরোপ মহাদেশের একটি ঐতিহাসিক শহর। বার্লিন শহরে ৩৪ লক্ষেরও বেশি লোক বাস করেন। শহরটি একাধারে একটি শহর এবং জার্মানির একটি রাজ্য। বার্লিনের আয়তন ৩৪৩ বর্গমাইল; এটির আয়তন প্যারিস শহরের প্রায় ৯ গুণ।
|
2 |
+
বার্লিন একটি বহুসাংস্কৃতিক শহর। বিশ্বের ১৮৪টি দেশ থেকে আগত প্রায় ৪ লক্ষ ৩০ হাজার অভিবাসী বার্লিনে বাস করে। এদের মধ্যে তুরস্ক থেকে আগত অভিবাসীরা সংখ্যা সবচেয়ে বেশি; বার্লিনে প্রায় ১ লক্ষ ১৯ হাজার তুর্কি অভিবাসী বাস করে। তুরস্কের বাইরে বার্লিনেই ইউরোপে তুর্কিদের সবচেয়ে বড় সম্প্রদায় অবস্থিত।
|
3 |
+
১৯৪৯ সাল থেকে ১৯৯০ পর্যন্ত বার্লিন পূর্ব বার্লিন ও পশ্চিম বার্লিন---এই দুই ভাগে বিভক্ত ছিল। ১৯৬১ সালে পূর্ব জার্মান সরকার সেখানকার নাগরিকদের পশ্চিম বার্লিনে পালিয়ে যাওয়া ঠেকাতে দুই বার্লিনের মাঝে একটি দেয়াল তুলে দেয়। দেয়ালটি ১৯৬১ সাল থেকে ১৯৮৯ সাল পর্যন্ত টিকে ছিল। ঐ সময় ৫ হাজারেরও বেশি ব্যক্তি দেয়ালটি টপকানোর চেষ্টা করে; এদের মধ্যে ৩২০০ জনকে গ্রেফতার করা হয় এবং ১৯১ জন নিহত হয়।
|
4 |
+
১৯৮৯ সালে দেয়ালটি ভেঙে ফেলার পর বার্লিনের ব্রান্ডেনবুর্গ ফটক পূর্ব ও পশ্চিম বার্লিনের পুনঃএকত্রীকরণের প্রতীক হিসেবে দাঁড়িয়ে আছে।
|
5 |
+
বার্লিনের স্থানীয় ফুটবল দলের নাম হের্টা বে এস ৎসে বের্লিন। তারা ঘরোয়া ম্যাচগুলি বার্লিনের "অলিম্পিয়াষ্টাডিয়ন" নামের স্টেডিয়ামে খেলে থাকে। এই স্টেডিয়ামেই ১৯৩৬ সালের গ্রীষ্মকালীন অলিম্পিক্স অনুষ্ঠিত হয়।
|
6 |
+
বার্লিনে কুকুর পোষা খুবই ব্যয়বহুল একটি কাজ। কুকুরের মালিককে প্রতি বছর দেড়শ ইউরো কর দিতে হয়।
|
7 |
+
বার্লিনের কাউফ্হাউস ডেস ভেস্টেন্স (Kaufhaus des Westens, সংক্ষেপে KaDeWe, কাডেভে) ইউরোপের বৃহত্তম ডিপার্টমেন্ট স্টোর। এর আট তলাবিশিষ্ট ভবনে প্রায় ৪ লক্ষ জিনিস বেচা কেনা হয়।
|
8 |
+
মার্কিন যুক্তরাষ্ট্রের লস অ্যাঞ্জেলেস বার্লিনের ভগ্নী শহর।
|
uroman/text/bod.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
ཁྲིན་ཀོན་ཆུས
|
2 |
+
ལྷ་ས་གྲོང་ཁྱེར
|
3 |
+
[[ཁྲིན་ཀོན་ཆུས་ཞེས་པ་ནི་རྒྱ་ནག་གཞུང་གིས་བཙན་འཛུལ་བྱས་རྗེས་བཏགས་པའི་མིང་ཞིག་ཡིན་པ་དང། དེ་ནི་ད་ལྟའི་ཆར་ལྷ་ས་གྲོང་ཁྱེར་གྱི་ཁོངས་གཏོགས་རྫོང་ཁག་བདུན་པོ་ཕུད་པའི་གྲོང་ཁྱེར་ནང་ཁུལ་གྱི་ས་ཁུལ་ཁག་བསྡུས་པའི་གནས་དེར་ཁྲེང་ཀོན་ཆུས་ཞེས་པའི་ཁོངས་སུ་གཏོགས་པར་བཤད་ཡོད་ཅིང། ནུབ་ཏུ་སྟོད་ལུང་ས་འབྲེལ་འབྲས་སྤུངས་དན་བག་ཡན་དང་ཤར་དུ་གཤོངས་ཀ་གླིང་ཡན་ཙམ་དུ་ཡིན་ཚོད་འདུག]]
|
uroman/text/egy.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
𓈎𓃭𓇋𓍯𓊪𓄿𓆓𓂋𓄿𓏏𓆇
|
2 |
+
𓊪𓏏𓍯𓃭𓐝𓇌𓋴
|
3 |
+
𓆿𓍧𓎇𓏻
|
4 |
+
𓇌𓊪𓏲𓌙𓈉
|
5 |
+
|
uroman/text/ell.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Το Λος Άντζελες (στα ισπανικά Los Angeles = Οι Άγγελοι) ή στην Αμερικανική αργκό L.A., ελ έι) είναι η δεύτερη μεγαλύτερη πόλη των Ηνωμένων Πολιτειών από άποψη πληθυσμού, καθώς και ένα από τα σημαντικότερα οικονομικά, πολιτιστικά επιστημονικά και ψυχαγωγικά κέντρα του κόσμου.
|
2 |
+
Βρίσκεται στη δυτική ακτή των Η.Π.Α., στην πολιτεία της Καλιφόρνιας.
|
3 |
+
Έχει 3,85 εκατομμύρια κατοίκους σύμφωνα με εκτίμηση του 2006 και έκταση 1.214,9 τετραγωνικών χιλιομέτρων.
|
4 |
+
Η αχανής μητροπολιτική περιοχή του Λος Άντζελες εκτιμάται ότι αριθμεί περίπου 13 εκατομμύρια κατοίκους, οι οποίοι αποκαλούνται Angelenos.
|
5 |
+
Η πόλη αποτελεί ένα από τα πιο κοσμοπολίτικα μέρη στον κόσμο, καθώς κατοικούν άνθρωποι προερχόμενοι από κάθε γωνιά της γης, που προσελκύονται από το ευχάριστο κλίμα, τον έντονο και γεμάτο ενέργεια τρόπο ζωής αλλά και την υπόσχεση του αμερικανικού ονείρου.
|
6 |
+
|
7 |
+
Γερούν Ντάισελμπλουμ
|
8 |
+
Γιώργος Κωνσταντινίδης
|
uroman/text/fas.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
کالیفرنیا (به انگلیسی: California) ایالتی در غرب آمریکا بر کرانهٔ اقیانوس آرام است. مرکز آن ساکرامنتو و شهرهای مهم آن لسآنجلس، سن دیگو، سن خوزه و سانفرانسیسکو هستند.همچنین این ایالت پر جمعیت ترین ایالت امریکا است.
|
2 |
+
نام این ایالت از زبان اسپانیولی و به خصوص از رمانی به نام Las sergas de Esplandián گرفته شده، و متعلق به شخصیتی از این داستان است به نام ملکه Califia که احتمالاً از واژهٔ عربی «خلیفه» گرفته شده.[۲]
|
3 |
+
کالیفرنیا پرجمعیتترین ایالت ایالات متحده آمریکاست و نیز بزرگترین جمعیت ایرانی تبار خارج از خاور میانه را در خود جای داده است[۳] به طوری که چندین تن از اعضای شورای شهر بورلی هیلز ایرانیالاصل هستند.
|
4 |
+
کالیفرنیا نهمین اقتصاد جهان است. در سال ۲۰۱۲، این ایالت تولید ناخالص داخلی برابر با ۱٬۹۵۸٬۹۰۴تریلیون دلار داشت، که نزدیک به تولید ناخالص داخلی کشور ایتالیا (۲٬۰۱۳٬۳۷۵ میلیون دلار) بود.[۴]
|
5 |
+
کالیفرنیا صنعت فناوری اطلاعات و رایانهای بسیار پیشرفتهای دارد به طوری که شرکتهای اوراکل، سیسکو سیستمز، اینتل، گوگل، یاهو، شرکت ایامدی، سان مایکروسیستمز، و نیز شرکت رایانهای اپل و شِوران نیز در این ایالت مرکزیت دارند.
|
6 |
+
علاوه بر این، دو عدد از آزمایشگاههای فدرال بزرگ آمریکا در این ایالت قرار دارند از جمله آزمایشگاه ملی لارنس لیورمور و آزمایشگاه ملی لارنس برکلی.
|
uroman/text/heb.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
כֹּל עוֹד בַּלֵּבָב פְּנִימָה
|
2 |
+
נֶפֶשׁ יְהוּדִי הוֹמִיָּה
|
3 |
+
וּלְפַאֲתֵי מִזְרָח, קָדִימָה,
|
4 |
+
עַיִן לְצִיּוֹן צוֹפִיָּה,
|
5 |
+
עוֹד לֹא אָבְדָה תִּקְוָתֵנוּ,
|
6 |
+
הַתִּקְוָה בַּת שְׁנוֹת אַלְפַּיִם
|
7 |
+
לִהְיוֹת עַם חָפְשִׁי בְּאַרְצֵנוּ,
|
8 |
+
אֶרֶץ צִיּוֹן וִירוּשָׁלַיִם.
|
9 |
+
|
10 |
+
ניוּ יוֹרק (באנגלית: New York City, בקיצור NYC, או New York בלבד) היא העיר המאוכלסת ביותר בארצות הברית. שטחה העירוני הוא אחד מן הגדולים בעולם. כינויה של העיר הוא "התפוח הגדול". כבר למעלה ממאה שנים מהווה ניו יורק מרכז מסחרי וכלכלי מהגדולים בעולם. העיר מדורגת כעיר עולם מסוג אלפא (היא ולונדון הן היחידות המוגדרות כאלפא ++), בשל השפעתה העולמית על התקשורת, הפוליטיקה, החינוך, הבידור, התרבות והאופנה. מרכז התרבות האמנותי של העיר הוא בעל השפעה רבה בכל רחבי ארצות הברית ומחוצה לה. העיר היא מרכז לפעילות בינלאומית, ושוכן בה מטהו של ארגון האומות המאוחדות. ראש העיר החל מתחילת 2014 הוא ביל דה בלאזיו.
|
11 |
+
העיר ניו יורק מורכבת מחמישה רבעים: הברונקס, ברוקלין, מנהטן, קווינס וסטטן איילנד. אוכלוסייתה מונה מעל 8.2 מיליון תושבים, המשתכנים בשטח של 1,214 קילומטר רבוע, והיא הצפופה מבין ערי ארצות הברית.
|
12 |
+
רבות משכונות העיר ומאתריה ידועים ברחבי העולם. פסל החירות קידם את פניהם של מיליוני המהגרים בבואם לאמריקה בשלהי המאה ה-19 ובתחילת המאה ה-20. וול סטריט, השוכן במנהטן תחתית, מהווה מרכז פיננסי עולמי מאז סוף מלחמת העולם השנייה ומשכנה של הבורסה לניירות ערך. בעיר ממוקמים כמה מהבניינים הגבוהים בעולם, כמו בניין האמפייר סטייט, וכן שכנו בה מגדלי התאומים שנהרסו בפיגועי 11 בספטמבר.
|
13 |
+
ניו יורק היא מקום היווסדן של תנועות תרבות אמריקאיות רבות, כמו תנועת הספרות והאמנות הידועה בשם "הרנסאנס של הארלם", סגנון הציור האקספרסיוניסטי המופשט (שידוע גם כאסכולת ניו יורק), והסגנונות המוזיקליים היפ הופ, פאנק וסלסה. 36 אחוזים מאוכלוסייתה נולדו מחוץ לארצות הברית, ובשנת 2005 דוברו בה כמעט 170 שפות שונות. הרכבת התחתית של ניו יורק פועלת 24 שעות ביממה, רחובותיה סואנים בהולכי רגל ובמכוניות והיא מכונה "העיר שאינה ישנה לעולם".
|
14 |
+
|
15 |
+
אֱלוֹהַּ יֵשׁוּעַ ר֫וּחַ
|