Package translate :: Package storage :: Module lisa
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.lisa

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2006-2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of the Translate Toolkit. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """Parent class for LISA standards (TMX, TBX, XLIFF)""" 
 22   
 23  import re 
 24   
 25  try: 
 26      from lxml import etree 
 27      from translate.misc.xml_helpers import getText, getXMLlang, setXMLlang, \ 
 28                                             getXMLspace, setXMLspace, namespaced 
 29  except ImportError, e: 
 30      raise ImportError("lxml is not installed. It might be possible to continue without support for XML formats.") 
 31   
 32  from translate.storage import base 
 33  from translate.lang import data 
 34   
 35   
36 -def _findAllMatches(text, re_obj):
37 """generate match objects for all L{re_obj} matches in L{text}.""" 38 start = 0 39 max = len(text) 40 while start < max: 41 m = re_obj.search(text, start) 42 if not m: 43 break 44 yield m 45 start = m.end()

#TODO: we can now do better with our proper placeables support
# Regular expressions for the inline codes that createPHnodes() wraps in <ph>
# elements. Each pattern has exactly one capturing group spanning the whole
# placeholder.
placeholders = [
    # printf-style conversion, e.g. %s or %d
    r'(%[diouxXeEfFgGcrs])',
    # one or more backslashes plus the following character, e.g. \n
    r'(\\+.?)',
    # positional long hex, e.g. %1$lx ('$' must be escaped: a bare '$' is the
    # end-of-string anchor and the pattern could then never match)
    r'(%[0-9]\$lx)',
    # positional single-letter conversion, e.g. %1$s; the lookahead leaves
    # %1$lx to the pattern above so the same span is not matched twice
    r'(%[0-9]\$(?!lx)[a-z])',
    # markup tag, e.g. <b>
    r'(<.+?>)',
]
re_placeholders = [re.compile(ph) for ph in placeholders]


def _getPhMatches(text):
    """Return a list of regexp match objects for all placeholders in the
    L{text}, ordered by the position at which each match starts.

    Every pattern in C{re_placeholders} is applied to the whole text
    independently, so matches found by different patterns may overlap.

    @param text: the string to scan for placeholders
    @return: list of match objects sorted by C{start()}
    """
    matches = []
    for re_ph in re_placeholders:
        matches.extend(_findAllMatches(text, re_ph))

    # Sort them so they come sequentially. The sort is stable, so matches
    # starting at the same offset stay in pattern order.
    matches.sort(key=lambda m: m.start())
    return matches
63 64
class LISAunit(base.TranslationUnit):
    """
    A single unit in the file. Provisional work is done to make several
    languages possible.

    The unit wraps an lxml element (C{self.xmlelement}). Its first language
    node is treated as the source and its second as the target.
    """

    #The name of the root element of this unit type:(termEntry, tu, trans-unit)
    rootNode = ""
    # The name of the per language element of this unit type:(termEntry, tu,
    # trans-unit)
    languageNode = ""
    #The name of the innermost element of this unit type:(term, seg)
    textNode = ""

    namespace = None
    # The default handling of spacing in the absence of an xml:space
    # attribute.
    #
    # This is mostly for correcting XLIFF behaviour.
    _default_xml_space = "preserve"

    def __init__(self, source, empty=False, **kwargs):
        """Constructs a unit containing the given source string.

        @param source: the source text, passed on to the base class
        @param empty: if true, no XML element is created and the base class
            initialiser is not called; C{xmlelement} has to be assigned
            afterwards (as L{createfromxmlElement} does)
        @param kwargs: accepted for signature compatibility, not used here
        """
        self._rich_source = None
        self._rich_target = None
        if empty:
            self._state_n = 0
            return
        self.xmlelement = etree.Element(self.namespaced(self.rootNode))
        #add descrip, note, etc.
        super(LISAunit, self).__init__(source)

    def __eq__(self, other):
        """Compares two units.

        Two LISA units are equal when they have the same number of language
        nodes and the text of each pair of corresponding nodes is equal.
        Comparison with anything else is delegated to the base class.
        """
        if not isinstance(other, LISAunit):
            return super(LISAunit, self).__eq__(other)
        languageNodes = self.getlanguageNodes()
        otherlanguageNodes = other.getlanguageNodes()
        if len(languageNodes) != len(otherlanguageNodes):
            return False
        # Each unit's text is read using the xml:space setting of its own
        # element, not the setting of the unit it is being compared with.
        myspace = getXMLspace(self.xmlelement, self._default_xml_space)
        otherspace = getXMLspace(other.xmlelement, other._default_xml_space)
        for mynode, othernode in zip(languageNodes, otherlanguageNodes):
            mytext = self.getNodeText(mynode, myspace)
            othertext = other.getNodeText(othernode, otherspace)
            if mytext != othertext:
                #TODO:^ maybe we want to take children and notes into account
                return False
        return True

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return::
            {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def set_source_dom(self, dom_node):
        """Makes L{dom_node} the first language node of this unit, replacing
        the current first language node if there is one."""
        languageNodes = self.getlanguageNodes()
        if languageNodes:
            self.xmlelement.replace(languageNodes[0], dom_node)
        else:
            self.xmlelement.append(dom_node)

    def get_source_dom(self):
        """Returns the first language node, or None if there is none."""
        return self.getlanguageNode(lang=None, index=0)
    source_dom = property(get_source_dom, set_source_dom)

    def setsource(self, text, sourcelang='en'):
        """Sets the source text by building a new language node for it.

        @param text: the new source text
        @param sourcelang: language code handed to L{createlanguageNode}
        """
        # Any cached rich source no longer reflects the XML.
        self._rich_source = None
        text = data.forceunicode(text)
        self.source_dom = self.createlanguageNode(sourcelang, text, "source")

    def getsource(self):
        """Returns the text of the first language node (None if missing)."""
        return self.getNodeText(self.source_dom,
                                getXMLspace(self.xmlelement,
                                            self._default_xml_space))
    source = property(getsource, setsource)

    def set_target_dom(self, dom_node, append=False):
        """Inserts, replaces or removes the target language node.

        @param dom_node: the new target node, or None to only remove the
            existing one
        @param append: if true, L{dom_node} is appended after the existing
            language nodes and nothing is removed; otherwise it is inserted
            at child position 1 and the previous second language node (if
            any) is removed
        """
        languageNodes = self.getlanguageNodes()
        # A target only makes sense once a source node exists.
        assert len(languageNodes) > 0
        if dom_node is not None:
            if append or len(languageNodes) == 0:
                self.xmlelement.append(dom_node)
            else:
                self.xmlelement.insert(1, dom_node)
        if not append and len(languageNodes) > 1:
            self.xmlelement.remove(languageNodes[1])

    def get_target_dom(self, lang=None):
        """Returns the language node for L{lang} if given, otherwise the
        second language node; None if there is no such node."""
        if lang:
            return self.getlanguageNode(lang=lang)
        return self.getlanguageNode(lang=None, index=1)
    target_dom = property(get_target_dom)

    def settarget(self, text, lang='xx', append=False):
        """Sets the "target" string (second language), or alternatively
        appends to the list.

        @param text: the new target text; None removes the target node
        @param lang: language code used when a new target node is created
        @param append: passed on to L{set_target_dom} when a new node is
            created
        """
        #XXX: we really need the language - can't really be optional, and we
        # need to propagate it
        # Any cached rich target no longer reflects the XML.
        self._rich_target = None
        text = data.forceunicode(text)
        # Firstly deal with reinitialising to None or setting to identical
        # string
        if self.gettarget() == text:
            return
        languageNode = self.get_target_dom(None)
        if text is None:
            self.set_target_dom(None, False)
            return
        if languageNode is None:
            languageNode = self.createlanguageNode(lang, text, "target")
            self.set_target_dom(languageNode, append)
            return
        if self.textNode:
            # Write into the first text node below the language node; fall
            # back to the language node itself when it has no text node.
            terms = languageNode.iter(self.namespaced(self.textNode))
            languageNode = next(terms, languageNode)
        languageNode.text = text

    def gettarget(self, lang=None):
        """retrieves the "target" text (second entry), or the entry in the
        specified language, if it exists"""
        return self.getNodeText(self.get_target_dom(lang),
                                getXMLspace(self.xmlelement,
                                            self._default_xml_space))
    target = property(gettarget, settarget)

    def createlanguageNode(self, lang, text, purpose=None):
        """Returns a xml Element setup with given parameters to represent a
        single language entry. Has to be overridden."""
        return None

    def createPHnodes(self, parent, text):
        """Create the text node in parent containing all the ph tags.

        Text between placeholders becomes C{parent.text} or the C{tail} of
        the preceding C{ph} element; each placeholder becomes a C{ph} child
        of L{parent} with a 1-based C{id} attribute.

        @param parent: the element to fill
        @param text: the string to split into text and placeholders
        """
        matches = _getPhMatches(text)
        if not matches:
            parent.text = text
            return

        # Now we know there will definitely be some ph tags
        start = matches[0].start()
        pretext = text[:start]
        if pretext:
            parent.text = pretext
        lasttag = parent
        phcount = 0
        for m in matches:
            if m.start() < start:
                # This match begins inside the placeholder emitted just
                # before it; emitting it as well would duplicate that text.
                continue
            #pretext
            pretext = text[start:m.start()]
            # this will never happen with the first ph tag
            if pretext:
                lasttag.tail = pretext
            #ph node
            phcount += 1
            phnode = etree.SubElement(parent, self.namespaced("ph"))
            phnode.set("id", str(phcount))
            phnode.text = m.group()
            lasttag = phnode
            start = m.end()
        #post text
        if text[start:]:
            lasttag.tail = text[start:]

    def getlanguageNodes(self):
        """Returns a list of all nodes that contain per language information.
        """
        return list(self.xmlelement.iterchildren(self.namespaced(self.languageNode)))

    def getlanguageNode(self, lang=None, index=None):
        """Retrieves a languageNode either by language or by index.

        @param lang: if given (and non-empty), the first language node whose
            language equals it is returned
        @param index: position in the list of language nodes, used when no
            language is given
        @return: the matching node, or None if there is none
        @raise KeyError: if both L{lang} and L{index} are None
        """
        if lang is None and index is None:
            raise KeyError("No criterea for languageNode given")
        languageNodes = self.getlanguageNodes()
        if lang:
            for node in languageNodes:
                if getXMLlang(node) == lang:
                    return node
            return None
        #have to use index
        if index >= len(languageNodes):
            return None
        return languageNodes[index]

    def getNodeText(self, languageNode, xml_space="preserve"):
        """Retrieves the term from the given languageNode.

        @param languageNode: the language node to read, may be None
        @param xml_space: whitespace handling passed on to C{getText}
        @return: the text, or None if L{languageNode} is None or contains no
            text node although this unit type defines one
        """
        if languageNode is None:
            return None
        if not self.textNode:
            return getText(languageNode, xml_space)
        terms = languageNode.iterdescendants(self.namespaced(self.textNode))
        # An exhausted iterator, not None, is what signals a missing text node.
        term = next(terms, None)
        if term is None:
            return None
        return getText(term, xml_space)

    def __str__(self):
        """Returns the unit's XML, pretty-printed and encoded as UTF-8."""
        return etree.tostring(self.xmlelement, pretty_print=True,
                              encoding='utf-8')

    def _set_property(self, name, value):
        """Sets attribute L{name} of the unit's XML element to L{value}."""
        self.xmlelement.attrib[name] = value

    # The 'xid' and 'rid' attributes of the unit's XML element, looked up
    # under their namespaced names.
    xid = property(lambda self: self.xmlelement.attrib[self.namespaced('xid')],
                   lambda self, value: self._set_property(self.namespaced('xid'), value))

    rid = property(lambda self: self.xmlelement.attrib[self.namespaced('rid')],
                   lambda self, value: self._set_property(self.namespaced('rid'), value))

    @classmethod
    def createfromxmlElement(cls, element):
        """Alternative constructor: wraps an existing XML element in a unit
        without creating any new XML."""
        term = cls(None, empty=True)
        term.xmlelement = element
        return term
289 290
class LISAfile(base.TranslationStore):
    """A store holding the units of one file in a LISA format, backed by an
    lxml document."""
    UnitClass = LISAunit
    # Name of the document's root element:
    rootNode = ""
    # Name of the element that contains the units:
    bodyNode = ""
    # XML text parsed when the store is constructed without an input file:
    XMLskeleton = ""

    namespace = None

    def __init__(self, inputfile=None, sourcelanguage='en',
                 targetlanguage=None, unitclass=None):
        """Builds the store from L{inputfile}, or from C{XMLskeleton} when
        no input is given.

        @param inputfile: XML string or file-like object to parse, or None
        @param sourcelanguage: source language set on a new, empty store
        @param targetlanguage: target language set on a new, empty store
        @param unitclass: passed on to the base class initialiser
        """
        super(LISAfile, self).__init__(unitclass=unitclass)
        if inputfile is None:
            # Newlines are removed from the skeleton so that its whitespace
            # does not get in the way of lxml's pretty printing.
            skeleton = self.XMLskeleton.replace("\n", "")
            self.parse(skeleton)
            self.setsourcelanguage(sourcelanguage)
            self.settargetlanguage(targetlanguage)
            self.addheader()
            self._encoding = "UTF-8"
        else:
            self.parse(inputfile)
            expected_root = self.namespaced(self.rootNode)
            assert self.document.getroot().tag == expected_root

    def addheader(self):
        """Hook for subclasses to set up headers and the like; does nothing
        here."""
        pass

    def namespaced(self, name):
        """Gives L{name} in Clark notation, qualified with this store's
        namespace.

        In an XLIFF document, namespaced("source") might give::
            {urn:oasis:names:tc:xliff:document:1.1}source
        lxml needs names in this form.
        """
        return namespaced(self.namespace, name)

    def initbody(self):
        """Records the document's default namespace and looks up the body
        element once, storing it as C{self.body}."""
        root = self.document.getroot()
        self.namespace = root.nsmap.get(None, None)
        body_path = '//%s' % self.namespaced(self.bodyNode)
        self.body = self.document.find(body_path)

    def addsourceunit(self, source):
        """Creates a unit whose first entry is L{source}, adds it to the
        store and returns it."""
        #TODO: maybe this should rather be called addsourcestring or
        # something?
        unit = self.UnitClass(source)
        self.addunit(unit)
        return unit

    def addunit(self, unit, new=True):
        """Adds L{unit} to the store.

        @param unit: the unit to add; it is given this store's namespace
        @param new: if true, the unit's XML element is also appended to the
            body element
        """
        unit.namespace = self.namespace
        super(LISAfile, self).addunit(unit)
        if not new:
            return
        self.body.append(unit.xmlelement)

    def __str__(self):
        """Serialises the document to UTF-8 XML with an XML declaration."""
        return etree.tostring(self.document, pretty_print=True,
                              xml_declaration=True, encoding='utf-8')

    def parse(self, xml):
        """Fills this store from L{xml}.

        @param xml: an XML string, or an object with C{read} and C{seek}
            methods from which the XML is read
        """
        if not hasattr(self, 'filename'):
            self.filename = getattr(xml, 'name', '')
        if hasattr(xml, "read"):
            xml.seek(0)
            xml = xml.read()
        # strip_cdata is only understood from lxml 2.1.0 on; turning it off
        # stops CDATA sections from being converted to raw XML.
        parser_options = {}
        if etree.LXML_VERSION >= (2, 1, 0):
            parser_options['strip_cdata'] = False
        parser = etree.XMLParser(**parser_options)
        parsed_root = etree.fromstring(xml, parser)
        self.document = parsed_root.getroottree()
        self._encoding = self.document.docinfo.encoding
        self.initbody()
        assert self.document.getroot().tag == self.namespaced(self.rootNode)
        unit_tag = self.namespaced(self.UnitClass.rootNode)
        for element in self.document.getroot().iterdescendants(unit_tag):
            unit = self.UnitClass.createfromxmlElement(element)
            self.addunit(unit, new=False)
376