Package translate :: Package lang :: Module data
[hide private]
[frames | no frames]

Source Code for Module translate.lang.data

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2007-2011 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """This module stores information and functionality that relates to plurals.""" 
 23   
 24  import unicodedata 
 25   
 26  from translate.storage.placeables import StringElem 
 27   
 28   
 29  languages = { 
 30  'af': (u'Afrikaans', 2, '(n != 1)'), 
 31  'ak': (u'Akan', 2, 'n > 1'), 
 32  'am': (u'Amharic', 2, 'n > 1'), 
 33  'an': (u'Aragonese', 2, '(n != 1)'), 
 34  'ar': (u'Arabic', 6, 'n==0 ? 0 : n==1 ? 1 : n==2 ? 2 : n%100>=3 && n%100<=10 ? 3 : n%100>=11 ? 4 : 5'), 
 35  'arn': (u'Mapudungun; Mapuche', 2, 'n > 1'), 
 36  'ast': (u'Asturian; Bable; Leonese; Asturleonese', 2, '(n != 1)'), 
 37  'az': (u'Azerbaijani', 2, '(n != 1)'), 
 38  'be': (u'Belarusian', 3, 'n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 39  'bg': (u'Bulgarian', 2, '(n != 1)'), 
 40  'bn': (u'Bengali', 2, '(n != 1)'), 
 41  'bn_IN': (u'Bengali (India)', 2, '(n != 1)'), 
 42  'bo': (u'Tibetan', 1, '0'), 
 43  'br': (u'Breton', 2, 'n > 1'), 
 44  'bs': (u'Bosnian', 3, 'n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 45  'ca': (u'Catalan; Valencian', 2, '(n != 1)'), 
 46  'ca@valencia': (u'Catalan; Valencian (Valencia)', 2, '(n != 1)'), 
 47  'cs': (u'Czech', 3, '(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2'), 
 48  'csb': (u'Kashubian', 3, 'n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 49  'cy': (u'Welsh', 2, '(n==2) ? 1 : 0'), 
 50  'da': (u'Danish', 2, '(n != 1)'), 
 51  'de': (u'German', 2, '(n != 1)'), 
 52  'dz': (u'Dzongkha', 1, '0'), 
 53  'el': (u'Greek, Modern (1453-)', 2, '(n != 1)'), 
 54  'en': (u'English', 2, '(n != 1)'), 
 55  'en_GB': (u'English (United Kingdom)', 2, '(n != 1)'), 
 56  'en_ZA': (u'English (South Africa)', 2, '(n != 1)'), 
 57  'eo': (u'Esperanto', 2, '(n != 1)'), 
 58  'es': (u'Spanish; Castilian', 2, '(n != 1)'), 
 59  'et': (u'Estonian', 2, '(n != 1)'), 
 60  'eu': (u'Basque', 2, '(n != 1)'), 
 61  'fa': (u'Persian', 1, '0'), 
 62  'fi': (u'Finnish', 2, '(n != 1)'), 
 63  'fil': (u'Filipino; Pilipino', 2, '(n > 1)'), 
 64  'fo': (u'Faroese', 2, '(n != 1)'), 
 65  'fr': (u'French', 2, '(n > 1)'), 
 66  'fur': (u'Friulian', 2, '(n != 1)'), 
 67  'fy': (u'Frisian', 2, '(n != 1)'), 
 68  'ga': (u'Irish', 3, 'n==1 ? 0 : n==2 ? 1 : 2'), 
 69  'gd': (u'Gaelic; Scottish Gaelic', 2, 'nplurals=4; plural=(n==1 || n==11) ? 0 : (n==2 || n==12) ? 1 : (n > 2 && n < 20) ? 2 : 3'), 
 70  'gl': (u'Galician', 2, '(n != 1)'), 
 71  'gu': (u'Gujarati', 2, '(n != 1)'), 
 72  'gun': (u'Gun', 2, '(n > 1)'), 
 73  'ha': (u'Hausa', 2, '(n != 1)'), 
 74  'he': (u'Hebrew', 2, '(n != 1)'), 
 75  'hi': (u'Hindi', 2, '(n != 1)'), 
 76  'hy': (u'Armenian', 1, '0'), 
 77  'hr': (u'Croatian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
 78  'hu': (u'Hungarian', 2, '(n != 1)'), 
 79  'ia': (u"Interlingua (International Auxiliary Language Association)", 2, '(n != 1)'), 
 80  'id': (u'Indonesian', 1, '0'), 
 81  'is': (u'Icelandic', 2, '(n != 1)'), 
 82  'it': (u'Italian', 2, '(n != 1)'), 
 83  'ja': (u'Japanese', 1, '0'), 
 84  'jv': (u'Javanese', 2, '(n != 1)'), 
 85  'ka': (u'Georgian', 1, '0'), 
 86  'kk': (u'Kazakh', 1, '0'), 
 87  'km': (u'Central Khmer', 1, '0'), 
 88  'kn': (u'Kannada', 2, '(n != 1)'), 
 89  'ko': (u'Korean', 1, '0'), 
 90  'ku': (u'Kurdish', 2, '(n != 1)'), 
 91  'kw': (u'Cornish', 4, '(n==1) ? 0 : (n==2) ? 1 : (n == 3) ? 2 : 3'), 
 92  'ky': (u'Kirghiz; Kyrgyz', 1, '0'), 
 93  'lb': (u'Luxembourgish; Letzeburgesch', 2, '(n != 1)'), 
 94  'ln': (u'Lingala', 2, '(n > 1)'), 
 95  'lo': (u'Lao', 1, '0'), 
 96  'lt': (u'Lithuanian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
 97  'lv': (u'Latvian', 3, '(n%10==1 && n%100!=11 ? 0 : n != 0 ? 1 : 2)'), 
 98  'mai': (u'Maithili', 2, '(n != 1)'), 
 99  'mfe': (u'Morisyen', 2, '(n > 1)'), 
100  'mg': (u'Malagasy', 2, '(n > 1)'), 
101  'mi': (u'Maori', 2, '(n > 1)'), 
102  'mk': (u'Macedonian', 2, 'n==1 || n%10==1 ? 0 : 1'), 
103  'ml': (u'Malayalam', 2, '(n != 1)'), 
104  'mn': (u'Mongolian', 2, '(n != 1)'), 
105  'mr': (u'Marathi', 2, '(n != 1)'), 
106  'ms': (u'Malay', 1, '0'), 
107  'mt': (u'Maltese', 4, '(n==1 ? 0 : n==0 || ( n%100>1 && n%100<11) ? 1 : (n%100>10 && n%100<20 ) ? 2 : 3)'), 
108  'nah': (u'Nahuatl languages', 2, '(n != 1)'), 
109  'nap': (u'Neapolitan', 2, '(n != 1)'), 
110  'nb': (u'Bokmål, Norwegian; Norwegian Bokmål', 2, '(n != 1)'), 
111  'ne': (u'Nepali', 2, '(n != 1)'), 
112  'nl': (u'Dutch; Flemish', 2, '(n != 1)'), 
113  'nn': (u'Norwegian Nynorsk; Nynorsk, Norwegian', 2, '(n != 1)'), 
114  'nso': (u'Pedi; Sepedi; Northern Sotho', 2, '(n != 1)'), 
115  'oc': (u'Occitan (post 1500)', 2, '(n > 1)'), 
116  'or': (u'Oriya', 2, '(n != 1)'), 
117  'pa': (u'Panjabi; Punjabi', 2, '(n != 1)'), 
118  'pap': (u'Papiamento', 2, '(n != 1)'), 
119  'pl': (u'Polish', 3, '(n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
120  'pms': (u'Piemontese', 2, '(n != 1)'), 
121  'ps': (u'Pushto; Pashto', 2, '(n != 1)'), 
122  'pt': (u'Portuguese', 2, '(n != 1)'), 
123  'pt_BR': (u'Portuguese (Brazil)', 2, '(n > 1)'), 
124  'rm': (u'Romansh', 2, '(n != 1)'), 
125  'ro': (u'Romanian', 3, '(n==1 ? 0 : (n==0 || (n%100 > 0 && n%100 < 20)) ? 1 : 2);'), 
126  'ru': (u'Russian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
127  'sco': (u'Scots', 2, '(n != 1)'), 
128  'si': (u'Sinhala; Sinhalese', 2, '(n != 1)'), 
129  'sk': (u'Slovak', 3, '(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2'), 
130  'sl': (u'Slovenian', 4, '(n%100==1 ? 0 : n%100==2 ? 1 : n%100==3 || n%100==4 ? 2 : 3)'), 
131  'so': (u'Somali', 2, '(n != 1)'), 
132  'sq': (u'Albanian', 2, '(n != 1)'), 
133  'sr': (u'Serbian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
134  'st': (u'Sotho, Southern', 2, '(n != 1)'), 
135  'su': (u'Sundanese', 1, '0'), 
136  'sv': (u'Swedish', 2, '(n != 1)'), 
137  'sw': (u'Swahili', 2, '(n != 1)'), 
138  'ta': (u'Tamil', 2, '(n != 1)'), 
139  'te': (u'Telugu', 2, '(n != 1)'), 
140  'tg': (u'Tajik', 2, '(n != 1)'), 
141  'ti': (u'Tigrinya', 2, '(n > 1)'), 
142  'th': (u'Thai', 1, '0'), 
143  'tk': (u'Turkmen', 2, '(n != 1)'), 
144  'tr': (u'Turkish', 1, '0'), 
145  'tt': (u'Tatar', 1, '0'), 
146  'ug': (u'Uighur; Uyghur', 1, '0'), 
147  'uk': (u'Ukrainian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
148  'vi': (u'Vietnamese', 1, '0'), 
149  'wa': (u'Walloon', 2, '(n > 1)'), 
150  # Chinese is difficult because the main divide is on script, not really 
151  # country. Simplified Chinese is used mostly in China, Singapore and Malaysia. 
152  # Traditional Chinese is used mostly in Hong Kong, Taiwan and Macau. 
153  'zh_CN': (u'Chinese (China)', 1, '0'), 
154  'zh_HK': (u'Chinese (Hong Kong)', 1, '0'), 
155  'zh_TW': (u'Chinese (Taiwan)', 1, '0'), 
156  'zu': (u'Zulu', 2, '(n != 1)'), 
157  } 
158  """Dictionary of language data. 
159  The language code is the dictionary key (which may contain country codes and modifiers). 
160  The value is a tuple: (Full name in English from iso-codes, nplurals, plural equation). 
161   
162  Note that the English names should not be used in user facing places - it 
163  should always be passed through the function returned from tr_lang(), or at 
164  least passed through _fix_language_name().""" 
165   
166  _fixed_names = { 
167          u"Asturian; Bable; Leonese; Asturleonese": u"Asturian", 
168          u"Bokmål, Norwegian; Norwegian Bokmål": u"Norwegian Bokmål", 
169          u"Catalan; Valencian": u"Catalan", 
170          u"Central Khmer": u"Khmer", 
171          u"Chichewa; Chewa; Nyanja": u"Chewa; Nyanja", 
172          u"Divehi; Dhivehi; Maldivian": u"Divehi", 
173          u"Dutch; Flemish": u"Dutch", 
174          u"Filipino; Pilipino": u"Filipino", 
175          u"Gaelic; Scottish Gaelic": u"Scottish Gaelic", 
176          u"Greek, Modern (1453-)": u"Greek", 
177          u"Interlingua (International Auxiliary Language Association)": u"Interlingua", 
178          u"Kirghiz; Kyrgyz": u"Kirghiz", 
179          u"Klingon; tlhIngan-Hol": u"Klingon", 
180          u"Limburgan; Limburger; Limburgish": u"Limburgish", 
181          u"Low German; Low Saxon; German, Low; Saxon, Low": u"Low German", 
182          u"Luxembourgish; Letzeburgesch": u"Luxembourgish", 
183          u"Ndebele, South; South Ndebele": u"Southern Ndebele", 
184          u"Norwegian Nynorsk; Nynorsk, Norwegian": u"Norwegian Nynorsk", 
185          u"Occitan (post 1500)": u"Occitan", 
186          u"Panjabi; Punjabi": u"Punjabi", 
187          u"Pedi; Sepedi; Northern Sotho": u"Northern Sotho", 
188          u"Pushto; Pashto": u"Pashto", 
189          u"Sinhala; Sinhalese": u"Sinhala", 
190          u"Sotho, Southern": u"Sotho", 
191          u"Spanish; Castilian": u"Spanish", 
192          u"Uighur; Uyghur": u"Uighur", 
193  } 
194   
195   
def simplercode(code):
    """Drop the last component of a language code, e.g. the country code.

    The code is normalized with normalize_code() only to locate the last
    separator ("-", "_" or "@"); the returned prefix is sliced from the
    original, un-normalized C{code}.

    @param code: A language code, possibly with country code or modifier
    @return: C{code} itself if it is empty or None, the part of C{code}
        before its last separator, or C{""} if it contains no separator

    @see:
        - U{http://www.rfc-editor.org/rfc/bcp/bcp47.txt}
        - U{http://www.rfc-editor.org/rfc/rfc4646.txt}
        - U{http://www.rfc-editor.org/rfc/rfc4647.txt}
        - U{http://www.w3.org/International/articles/language-tags/}
    """
    if code:
        cut = normalize_code(code).rfind('-')
        if cut < 0:
            # Nothing left to strip.
            return ""
        return code[:cut]
    return code
expansion_factors = {
    'af': 0.1,
    'ar': -0.09,
    'es': 0.21,
    'fr': 0.28,
    'it': 0.2,
}
"""Source to target string length expansion factors."""

import gettext
import locale
import re
import os

# Filled lazily by gettext_lang(): language code -> gettext function.
iso639 = {}
"""ISO 639 language codes"""
# Filled lazily by gettext_country(): language code -> gettext function.
iso3166 = {}
"""ISO 3166 country codes"""

# language[_COUNTRY][@modifier], with the country part in upper case.
langcode_re = re.compile("^[a-z]{2,3}([_-][A-Z]{2,3}|)(@[a-zA-Z0-9]+|)$")
# Same shape as langcode_re, but matched case-insensitively.
langcode_ire = re.compile("^[a-z]{2,3}([_-][a-z]{2,3})?(@[a-z0-9]+)?$", re.IGNORECASE)
# Only the part after the language: _COUNTRY[@modifier].
variant_re = re.compile("^[_-][A-Z]{2,3}(@[a-zA-Z0-9]+|)$")
def languagematch(languagecode, otherlanguagecode):
    """Match a language code against another, ignoring regions in the second.

    @param languagecode: The code to look for, or None to only check that
        C{otherlanguagecode} looks like a language code
    @param otherlanguagecode: The code to test, which may carry a region
    @return: A true value on a match and a false value otherwise. This is
        True for identical codes, False when C{otherlanguagecode} does not
        start with C{languagecode}, and otherwise the result of the regular
        expression match (a match object or None).
    """
    if languagecode is None:
        return langcode_re.match(otherlanguagecode)
    if languagecode == otherlanguagecode:
        return True
    if not otherlanguagecode.startswith(languagecode):
        return False
    # Whatever follows the shared prefix must be a valid region/modifier.
    remainder = otherlanguagecode[len(languagecode):]
    return variant_re.match(remainder)
# Splits a name of the form "language (country)" into its two parts.
# The limit of 25 characters on the country name is so that "Interlingua (...)"
# (see above) is correctly interpreted.
dialect_name_re = re.compile(r"(.+)\s\(([^)\d]{,25})\)$")
def tr_lang(langcode=None):
    """Build a translator for language names.

    The returned function accepts a language name, also in the form
    C{"language (country)"}, and translates it into the language with iso
    code C{langcode}, or into the system language if no language is
    specified. Every translated language name is passed through
    _fix_language_name().

    @param langcode: Language code of the language to translate into
    @return: A function taking a name and returning its translation
    """
    translate_language = gettext_lang(langcode)
    translate_country = gettext_country(langcode)

    def handlelanguage(name):
        dialect = dialect_name_re.match(name)
        if not dialect:
            # A plain language name with no "(country)" suffix.
            return _fix_language_name(translate_language(name))
        language, country = dialect.groups()
        shown_language = _fix_language_name(translate_language(language))
        shown_country = translate_country(country)
        return u"%s (%s)" % (shown_language, shown_country)

    return handlelanguage
def _fix_language_name(name):
    """Identify and replace some unsightly names present in iso-codes.

    A name listed in _fixed_names is assumed to be untranslated and is
    swapped for the more usable rendering stored there. Any other name
    longer than 11 characters is cut just before its first semi-colon at
    index 5 or later, to keep things neat. Everything else is returned
    unchanged.

    @param name: A (possibly translated) language name
    @return: The tidied-up name
    """
    replacement = _fixed_names.get(name)
    if replacement is not None:
        return replacement
    # These constants are somewhat arbitrary, but testing with the Japanese
    # translation of ISO codes suggests these as the upper bounds.
    if len(name) > 11:
        cut = name.find(u';', 5)
        if cut >= 0:
            return name[:cut]
    return name
286 287
def gettext_lang(langcode=None):
    """Return a gettext function that translates language names.

    The function translates into the given language, or into the system
    language if no language is specified. Results are cached in the
    module-level C{iso639} dictionary, keyed by language code, with C{""}
    standing for "no language specified".

    @param langcode: Language code to translate into, or None/empty for
        the system language
    @return: A callable taking a message and returning its translation
    """
    # Normalize before the cache lookup: the entry is stored under "", so
    # looking it up with None would miss and rebuild the translation on
    # every call.
    if not langcode:
        langcode = ""
    if langcode not in iso639:
        if langcode:
            t = gettext.translation('iso_639', languages=[langcode], fallback=True)
        elif os.name == "nt":
            # On Windows the default locale is not used for some reason
            t = gettext.translation('iso_639', languages=[locale.getdefaultlocale()[0]], fallback=True)
        else:
            t = gettext.translation('iso_639', fallback=True)
        # ugettext() only exists on Python 2; fall back to gettext() where
        # it is absent (Python 3).
        iso639[langcode] = getattr(t, 'ugettext', None) or t.gettext
    return iso639[langcode]
303 304
def gettext_country(langcode=None):
    """Return a gettext function that translates country names.

    The function translates into the given language, or into the system
    language if no language is specified. Results are cached in the
    module-level C{iso3166} dictionary, keyed by language code, with C{""}
    standing for "no language specified".

    @param langcode: Language code to translate into, or None/empty for
        the system language
    @return: A callable taking a message and returning its translation
    """
    # Normalize before the cache lookup: the entry is stored under "", so
    # looking it up with None would miss and rebuild the translation on
    # every call.
    if not langcode:
        langcode = ""
    if langcode not in iso3166:
        if langcode:
            t = gettext.translation('iso_3166', languages=[langcode], fallback=True)
        elif os.name == "nt":
            # On Windows the default locale is not used for some reason
            t = gettext.translation('iso_3166', languages=[locale.getdefaultlocale()[0]], fallback=True)
        else:
            t = gettext.translation('iso_3166', fallback=True)
        # ugettext() only exists on Python 2; fall back to gettext() where
        # it is absent (Python 3).
        iso3166[langcode] = getattr(t, 'ugettext', None) or t.gettext
    return iso3166[langcode]
320 321
def normalize(string, normal_form="NFC"):
    """Return a unicode string in its normalized form.

    @param string: The string to be normalized; None is passed through
    @param normal_form: NFC (default), NFD, NFKC, NFKD
    @return: Normalized string, or None if C{string} is None
    """
    if string is not None:
        return unicodedata.normalize(normal_form, string)
    return None
333 334
def forceunicode(string):
    """Ensures that the string is in unicode.

    Byte strings are decoded using their own C{encoding} attribute when
    they have one, and UTF-8 otherwise. StringElem instances are converted
    to their text form. Anything else is returned unchanged.

    @param string: A text string
    @type string: Unicode, String
    @return: String converted to Unicode; no normalization is applied
        here (see normalized_unicode() for that).
    @rtype: Unicode
    """
    if string is None:
        return None
    # ``bytes`` is the same type as ``str`` on Python 2 (2.6+), so this is
    # the original check there; on Python 3 it selects the only strings that
    # have a decode() method.
    if isinstance(string, bytes):
        encoding = getattr(string, "encoding", "utf-8")
        string = string.decode(encoding)
    elif isinstance(string, StringElem):
        # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
        string = type(u"")(string)
    return string
351 352
def normalized_unicode(string):
    """Convert the string with forceunicode() and then normalize() it.

    @param string: A text string, or None
    @return: The result of normalize() applied to the converted string
    """
    text = forceunicode(string)
    return normalize(text)
356 357
def normalize_code(code):
    """Bring a language code into a canonical form for comparisons.

    Underscores and at-signs become hyphens and the result is lower-cased,
    so that for example C{"pt_BR"} becomes C{"pt-br"}.

    @param code: A language code
    @return: The normalized code; an empty or None code is returned as is
    """
    if code:
        for separator in ("_", "@"):
            code = code.replace(separator, "-")
        code = code.lower()
    return code
362 363
def simplify_to_common(language_code, languages=languages):
    """Simplify language code to the most commonly used form for the
    language, stripping country information for languages that tend
    not to be localized differently for different countries.

    The code is shortened with simplercode() until it matches one of the
    codes in C{languages} (compared after normalize_code()) or cannot be
    shortened any further.

    @param language_code: The language code to simplify
    @param languages: The known language codes; only its keys are used
    @return: The first form of the code found in C{languages}, otherwise
        the shortest non-empty form reached. An empty or None code is
        returned unchanged.
    """
    # Build the lookup once. The same C{languages} is used for every level
    # of simplification, so a caller-supplied mapping is honoured throughout.
    known_codes = set(normalize_code(key) for key in languages)
    # An empty/None code cannot be simplified; returning it here also
    # avoids endless recursion on None.
    while language_code:
        if normalize_code(language_code) in known_codes:
            return language_code
        simpler = simplercode(language_code)
        if simpler == "":
            return language_code
        language_code = simpler
    return language_code
373