Package translate :: Package storage :: Module utx
[hide private]
[frames | no frames]

Source Code for Module translate.storage.utx

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2010 Zuza Software Foundation 
  5  # 
  6  # This file is part of the Translate Toolkit. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """Manage the Universal Terminology eXchange (UTX) format 
 22   
 23  UTX is a format for terminology exchange, designed it seems with Machine 
 24  Translation (MT) as its primary consumer.  The format is created by 
 25  the Asia-Pacific Association for Machine Translation (AAMT). 
 26   
 27  It is a bilingual base class derived format with L{UtxFile} 
 28  and L{UtxUnit} providing file and unit level access. 
 29   
 30  The format can manage monolingual dictionaries but these classes don't 
 31  implement that. 
 32   
 33  Specification 
 34  ============= 
 35  The format is implemented according to the v1.0 UTX 
 36  L{specification<http://www.aamt.info/english/utx/utx-simple-1.00-specification-e.pdf>} 
 37   
 38  Format Implementation 
 39  ===================== 
 40  The UTX format is a Tab Separated Value (TSV) file in UTF-8.  The 
 41  first two lines are headers with subsequent lines containing a 
 42  single source target definition. 
 43   
 44  Encoding 
 45  -------- 
 46  The files are UTF-8 encoded with no BOM and CR+LF line terminators. 
 47  """ 
 48   
 49  import csv 
 50  import sys 
 51  import time 
 52   
 53  from translate.storage import base 
 54   
 55   
class UtxDialect(csv.Dialect):
    """CSV dialect for UTX dictionaries: TAB-delimited and unquoted."""
    # UTX does not quote anything.
    quoting = csv.QUOTE_NONE
    # The spec says \r\n, but versions older than 1.0 exist with just \n.
    # FIXME if we find the older specs, see whether these differences can
    # be supported.
    lineterminator = "\r\n"
    delimiter = "\t"
    if sys.version_info[:3] < (2, 5, 0):
        # The csv module in Python < 2.5 needs every one of the following
        # items to be defined explicitly.
        # FIXME UTX does not quote anything, so why MINIMAL?
        quotechar = '"'
        escapechar = None
        skipinitialspace = False
        doublequote = False
        quoting = csv.QUOTE_MINIMAL


# Make the dialect available by name to csv readers and writers.
csv.register_dialect("utx", UtxDialect)
class UtxHeader:
    """Description of a UTX header entry.

    The header is a single line of the following form::
        #UTX-S <version>; <source language>/<target language>;
        <date created>; <optional fields (creator, license, etc.)>

    The parts are::
        - UTX-S version: currently 1.00.
        - Source language/target language: ISO 639, 3166 formats.
          A monolingual dictionary omits the target language.
        - Date created: ISO 8601 format.
        - Optional fields (creator, license, etc.)
    """
91 92
class UtxUnit(base.TranslationUnit):
    """A UTX dictionary unit.

    The unit's values live in a plain dictionary keyed by field name
    (``src``, ``tgt``, ``comment``, ...).  Text written through the
    accessors is stored UTF-8 encoded and handed back as unicode.
    """

    def __init__(self, source=None):
        """Create a unit, optionally seeded with a source term.

        @param source: the source term, or None for an empty unit
        """
        self._dict = {}
        if source:
            self.source = source
        super(UtxUnit, self).__init__(source)

    def getdict(self):
        """Get the dictionary of values for a UTX line"""
        return self._dict

    def setdict(self, newdict):
        """Set the dictionary of values for a UTX line

        @param newdict: a new dictionary with UTX line elements
        @type newdict: Dict
        """
        # TODO First check that the values are OK
        self._dict = newdict
    dict = property(getdict, setdict)

    def _get_field(self, key):
        """Return the value stored under key as unicode text.

        @param key: the field name to look up
        @return: None when the field is absent, an empty string when it is
        present but empty (or None), otherwise the text of the field
        """
        if key not in self._dict:
            return None
        value = self._dict[key]
        if not value:
            return ""
        if isinstance(value, unicode):
            # Already text (for instance a dictionary installed through
            # setdict()).  Calling decode() on unicode would first encode
            # it implicitly as ASCII and fail on non-ASCII characters.
            return value
        return value.decode('utf-8')

    def _set_field(self, key, newvalue):
        """Store newvalue under key, UTF-8 encoding unicode text.

        @param key: the field name to set
        @param newvalue: the new value; None is stored unchanged
        """
        # FIXME update the header date
        if newvalue is None:
            self._dict[key] = None
            return
        if isinstance(newvalue, unicode):
            newvalue = newvalue.encode('utf-8')
        self._dict[key] = newvalue

    def getnotes(self, origin=None):
        """Return the text of the ``comment`` field.

        @param origin: accepted but not used by this format
        @return: the comment text, or None if the unit has no comment field
        """
        return self._get_field('comment')

    def addnote(self, text, origin=None, position="append"):
        """Add text to the ``comment`` field.

        @param text: the note to add
        @param origin: accepted but not used by this format
        @param position: "append" adds text on a new line after an existing
        non-empty comment; any other value replaces the comment
        """
        currentnote = self._get_field('comment')
        # _get_field() yields None or "" when there is nothing to append to.
        if position == "append" and currentnote:
            self._set_field('comment', currentnote + '\n' + text)
        else:
            self._set_field('comment', text)

    def removenotes(self):
        """Reset the ``comment`` field to an empty string."""
        self._set_field('comment', u'')

    def getsource(self):
        """Return the ``src`` field."""
        return self._get_field('src')

    def setsource(self, newsource):
        """Set the ``src`` field and drop the cached rich source."""
        self._rich_source = None
        return self._set_field('src', newsource)
    source = property(getsource, setsource)

    def gettarget(self):
        """Return the ``tgt`` field."""
        return self._get_field('tgt')

    def settarget(self, newtarget):
        """Set the ``tgt`` field and drop the cached rich target."""
        self._rich_target = None
        return self._set_field('tgt', newtarget)
    target = property(gettarget, settarget)

    def settargetlang(self, newlang):
        """Record newlang under the ``target-lang`` key, stored as given."""
        self._dict['target-lang'] = newlang
    targetlang = property(None, settargetlang)

    def __str__(self):
        """Return the string form of the underlying dictionary."""
        return str(self._dict)

    def istranslated(self):
        """Return True if the ``tgt`` field holds a non-empty value."""
        return bool(self._dict.get('tgt', None))
171 172
class UtxFile(base.TranslationStore):
    """A UTX dictionary file"""
    Name = _("UTX Simple Dictionary")
    Mimetypes = ["text/x-utx"]
    Extensions = ["utx"]

    def __init__(self, inputfile=None, unitclass=UtxUnit):
        """Construct an UTX dictionary, optionally reading in from
        inputfile.

        @param inputfile: a file object or source string to parse, or None
        to start with an empty dictionary
        @param unitclass: the class used for units created while parsing
        """
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self.extension = ''
        self._fieldnames = ['src', 'tgt', 'src:pos']
        # The date and time are spelled out with documented directives;
        # the %F and %T shorthands are platform-specific.
        # NOTE(review): "Z" followed by a numeric offset is kept to
        # preserve the existing output - confirm against the UTX spec.
        self._header = {
            "version": "1.00",
            "source_language": "en",
            "date_created": time.strftime("%Y-%m-%dT%H:%M:%SZ%z",
                                          time.localtime()),
        }
        if inputfile is not None:
            self.parse(inputfile)

    def _read_header(self, header=None):
        """Read a UTX header

        Every leading line that starts with "#" belongs to the header.  The
        last of them lists the field names; the ones before it hold the
        ";" separated header components.

        @param header: the complete source text, or None to reset to a
        minimal default header
        @return: the number of header lines, or None when header is None
        @raise IndexError: if required header components are missing
        @raise ValueError: if an optional field has no "key: value" form
        """
        if header is None:
            self._fieldnames = ['src', 'tgt', 'src:pos']
            # FIXME make the header properly
            self._header = {"version": "1.00"}
            return
        header_lines = []
        for line in header.split(UtxDialect.lineterminator):
            if not line.startswith("#"):
                break
            header_lines.append(line)
        self._header = {}
        header_components = []
        for line in header_lines[:-1]:
            header_components += line[1:].split(";")
        self._header["version"] = header_components[0].replace("UTX-S ", "")
        languages = header_components[1].strip().split("/")
        self._header["source_language"] = languages[0]
        # A monolingual dictionary omits the target language, with or
        # without the "/" separator.
        if len(languages) > 1 and languages[1]:
            self._header["target_language"] = languages[1]
        else:
            self._header["target_language"] = None
        self._header["date_created"] = header_components[2].strip()
        for data in header_components[3:]:
            data = data.strip()
            if not data:
                # Tolerate a trailing or doubled ";".
                continue
            # Split on the first ":" only, values such as URLs contain more.
            key, value = data.split(":", 1)
            self._header[key.strip()] = value.strip()
        self._fieldnames = header_lines[-1].replace("#", "").split('\t')
        return len(header_lines)

    def _write_header(self):
        """Create a UTX header

        @return: the header line and the field name line, each ended with
        the dialect's line terminator
        """
        header = "#UTX-S %(version)s; %(src)s/%(tgt)s; %(date)s" % {
            "version": self._header["version"],
            "src": self._header["source_language"],
            # The stored value is None for monolingual files.
            "tgt": self._header.get("target_language") or "",
            "date": self._header["date_created"],
        }
        required = ("version", "source_language", "target_language",
                    "date_created")
        items = ["%s: %s" % (key, value)
                 for key, value in self._header.items()
                 if key not in required]
        if items:
            header += "; " + "; ".join(items)
        header += UtxDialect.lineterminator
        header += "#" + "\t".join(self._fieldnames) + UtxDialect.lineterminator
        return header

    def getsourcelanguage(self):
        """Return the source language from the header, or None."""
        return self._header.get("source_language", None)

    def setsourcelanguage(self, sourcelanguage):
        """Set the source language in the header."""
        self._header["source_language"] = sourcelanguage

    def gettargetlanguage(self):
        """Return the target language from the header, or None."""
        return self._header.get("target_language", None)

    def settargetlanguage(self, targetlanguage):
        """Set the target language in the header."""
        self._header["target_language"] = targetlanguage

    def parse(self, input):
        """Parse the given file or file source string

        @param input: a file-like object (it is read and then closed) or
        the source text itself
        @raise base.ParseError: if the header cannot be parsed
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            tmsrc = input.read()
            input.close()
            input = tmsrc
        try:
            header_length = self._read_header(input)
        except Exception:
            raise base.ParseError("Cannot parse header")
        lines = csv.DictReader(
            input.split(UtxDialect.lineterminator)[header_length:],
            fieldnames=self._fieldnames,
            dialect="utx")
        for line in lines:
            # Honour the unit class this store was constructed with.
            newunit = self.UnitClass()
            newunit.dict = line
            self.addunit(newunit)

    def __str__(self):
        """Serialise the header and all translated units.

        @return: the UTX text, or an empty string when no unit is
        translated
        """
        # Import StringIO directly; csv.StringIO is an undocumented detail
        # of the csv module.
        try:
            from cStringIO import StringIO
        except ImportError:
            from io import StringIO
        output = StringIO()
        # NOTE(review): DictWriter raises ValueError for unit keys that are
        # not in self._fieldnames (e.g. "comment") - confirm that is wanted.
        writer = csv.DictWriter(output, fieldnames=self._fieldnames,
                                dialect="utx")
        unit_count = 0
        for unit in self.units:
            if unit.istranslated():
                unit_count += 1
                writer.writerow(unit.dict)
        if unit_count == 0:
            return ""
        return self._write_header() + output.getvalue()