# Unicode/UCD.pm
# This function has traditionally mimicked what is in UnicodeData.txt,
# warts and all. This is a re-write that avoids UnicodeData.txt so that
# it can be removed to save disk space. Instead, this assembles
# information gotten by other methods that get data from various other
# files. It uses charnames to get the character name; and various
# mktables tables.
# see http://www.unicode.org/reports/tr44/#UnicodeData.txt

# Field names, in order, for one record of the kind described at the URL
# above.
# NOTE(review): 'IF_Numeric_Type_Decimal' is spelled with an upper-case
# 'IF_' while its two siblings use 'If_'.  It is kept verbatim here because
# code outside this excerpt may look the name up -- confirm before changing.
my @FIELDS = qw(
    Codepoint
    Name
    General_Category
    Canonical_Combining_Class
    Bidi_Class
    Decomposition_Type_Mapping
    IF_Numeric_Type_Decimal
    If_Numeric_Type_Digit
    If_Numeric_Type_Numeric
    Bidi_Mirrored
    Unicode_1_Name
    ISO_Comment
    Simple_Uppercase_Mapping
    Simple_Lowercase_Mapping
    Simple_Titlecase_Mapping
);

# Numeric Canonical_Combining_Class value => human-readable description.
# Most descriptions have the form 'Long_Name, free text'; the entries for
# 199, 204, 208, 210 and 212 carry only the free text (no long name).
my %CANONICAL_COMBINING_CLASS_VALUES = (
    0   => 'Not_Reordered, Spacing and enclosing marks; also many vowel and consonant signs, even if nonspacing',
    1   => 'Overlay, Marks which overlay a base letter or symbol',
    7   => 'Nukta, Diacritic nukta marks in Brahmi-derived scripts',
    8   => 'Kana_Voicing, Hiragana/Katakana voicing marks',
    9   => 'Virama, Viramas',
    10  => 'Ccc10, Start of fixed position classes',
    199 => 'End of fixed position classes',
    200 => 'Attached_Below_Left, Marks attached at the bottom left',
    202 => 'Attached_Below, Marks attached directly below',
    204 => 'Marks attached at the bottom right',
    208 => 'Marks attached to the left',
    210 => 'Marks attached to the right',
    212 => 'Marks attached at the top left',
    214 => 'Attached_Above, Marks attached directly above',
    216 => 'Attached_Above_Right, Marks attached at the top right',
    218 => 'Below_Left, Distinct marks at the bottom left',
    220 => 'Below, Distinct marks directly below',
    222 => 'Below_Right, Distinct marks at the bottom right',
    224 => 'Left, Distinct marks to the left',
    226 => 'Right, Distinct marks to the right',
    228 => 'Above_Left, Distinct marks at the top left',
    230 => 'Above, Distinct marks directly above',
    232 => 'Above_Right, Distinct marks at the top right',
    233 => 'Double_Below, Distinct marks subtending two bases',
    234 => 'Double_Above, Distinct marks extending above two bases',
    240 => 'Iota_Subscript, Greek iota subscript only',
);

# Bidi_Class short code => 'Long_Name, description'.
my %BIDI_CLASS_VALUES = (
    L   => 'Left_To_Right, any strong left-to-right character',
    R   => 'Right_To_Left, any strong right-to-left (non-Arabic-type) character',
    AL  => 'Arabic_Letter, any strong right-to-left (Arabic-type) character',
    EN  => 'European_Number, any ASCII digit or Eastern Arabic-Indic digit',
    ES  => 'European_Separator, plus and minus signs',
    ET  => 'European_Terminator, a terminator in a numeric format context, includes currency signs',
    AN  => 'Arabic_Number, any Arabic-Indic digit',
    CS  => 'Common_Separator, commas, colons, and slashes',
    NSM => 'Nonspacing_Mark, any nonspacing mark',
    BN  => 'Boundary_Neutral, most format characters, control codes, or noncharacters',
    B   => 'Paragraph_Separator, various newline characters',
    S   => 'Segment_Separator, various segment-related control codes',
    WS  => 'White_Space, spaces',
    ON  => 'Other_Neutral, most other symbols and punctuation marks',
    LRE => 'Left_To_Right_Embedding, U+202A: the LR embedding control',
    LRO => 'Left_To_Right_Override, U+202D: the LR override control',
    RLE => 'Right_To_Left_Embedding, U+202B: the RL embedding control',
    RLO => 'Right_To_Left_Override, U+202E: the RL override control',
    PDF => 'Pop_Directional_Format, U+202C: terminates an embedding or override control',
    LRI => 'Left_To_Right_Isolate, U+2066: the LR isolate control',
    RLI => 'Right_To_Left_Isolate, U+2067: the RL isolate control',
    FSI => 'First_Strong_Isolate, U+2068: the first strong isolate control',
    PDI => 'Pop_Directional_Isolate, U+2069: terminates an isolate control',
);

# Compatibility formatting tag => description.  The keys are bare tag names
# without angle brackets.
my %COMPATIBILITY_TAGS = (
    font     => 'Font variant (for example, a blackletter form)',
    noBreak  => 'No-break version of a space or hyphen',
    initial  => 'Initial presentation form (Arabic)',
    medial   => 'Medial presentation form (Arabic)',
    final    => 'Final presentation form (Arabic)',
    isolated => 'Isolated presentation form (Arabic)',
    circle   => 'Encircled form',
    super    => 'Superscript form',
    sub      => 'Subscript form',
    vertical => 'Vertical layout presentation form',
    wide     => 'Wide (or zenkaku) compatibility character',
    narrow   => 'Narrow (or hankaku) compatibility character',
    small    => 'Small variant form (CNS compatibility)',
    square   => 'CJK squared font variant',
    fraction => 'Vulgar fraction form',
    compat   => 'Otherwise unspecified compatibility character',
);

# General_Category short code => 'Long_Name, description'.
# NOTE(review): the grouping entries L, N, P and S have no comma after the
# long name, unlike LC, M, Z and C.  They are kept verbatim; if any consumer
# splits these strings on ', ' the four entries will not split -- confirm
# against the code that reads this table.
my %GENERAL_CATEGORY_VALUES = (
    Lu => 'Uppercase_Letter, an uppercase letter',
    Ll => 'Lowercase_Letter, a lowercase letter',
    Lt => 'Titlecase_Letter, a digraphic character, with first part uppercase',
    LC => 'Cased_Letter, Lu | Ll | Lt',
    Lm => 'Modifier_Letter, a modifier letter',
    Lo => 'Other_Letter, other letters, including syllables and ideographs',
    L  => 'Letter Lu | Ll | Lt | Lm | Lo',
    Mn => 'Nonspacing_Mark, a nonspacing combining mark (zero advance width)',
    Mc => 'Spacing_Mark, a spacing combining mark (positive advance width)',
    Me => 'Enclosing_Mark, an enclosing combining mark',
    M  => 'Mark, Mn | Mc | Me',
    Nd => 'Decimal_Number, a decimal digit',
    Nl => 'Letter_Number, a letterlike numeric character',
    No => 'Other_Number, a numeric character of other type',
    N  => 'Number Nd | Nl | No',
    Pc => 'Connector_Punctuation, a connecting punctuation mark, like a tie',
    Pd => 'Dash_Punctuation, a dash or hyphen punctuation mark',
    Ps => 'Open_Punctuation, an opening punctuation mark (of a pair)',
    Pe => 'Close_Punctuation, a closing punctuation mark (of a pair)',
    Pi => 'Initial_Punctuation, an initial quotation mark',
    Pf => 'Final_Punctuation, a final quotation mark',
    Po => 'Other_Punctuation, a punctuation mark of other type',
    P  => 'Punctuation Pc | Pd | Ps | Pe | Pi | Pf | Po',
    Sm => 'Math_Symbol, a symbol of mathematical use',
    Sc => 'Currency_Symbol, a currency sign',
    Sk => 'Modifier_Symbol, a non-letterlike modifier symbol',
    So => 'Other_Symbol, a symbol of other type',
    S  => 'Symbol Sm | Sc | Sk | So',
    Zs => 'Space_Separator, a space character (of various non-zero widths)',
    Zl => 'Line_Separator, U+2028 LINE SEPARATOR only',
    Zp => 'Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only',
    Z  => 'Separator, Zs | Zl | Zp',
    Cc => 'Control, a C0 or C1 control code',
    Cf => 'Format, a format control character',
    Cs => 'Surrogate, a surrogate code point',
    Co => 'Private_Use, a private-use character',
    Cn => 'Unassigned, a reserved unassigned code point or a noncharacter',
    C  => 'Other, Cc | Cf | Cs | Co | Cn',
);