
Source Code for Module translate.search.match

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2006-2009 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""Class to perform translation memory matching from a store of translation units"""

import heapq
import re

from translate.search import lshtein
from translate.search import terminology
from translate.storage import base
from translate.storage import po
from translate.misc.multistring import multistring


def sourcelen(unit):
    """Returns the length of the source string"""
    return len(unit.source)


def _sort_matches(matches, match_info):

    def _matches_cmp(x, y):
        # This function will sort a list of matches according to the match's starting
        # position, putting the one with the longer source text first, if two are the same.
        c = cmp(match_info[x.source]['pos'], match_info[y.source]['pos'])
        return c and c or cmp(len(y.source), len(x.source))
    matches.sort(_matches_cmp)


class matcher(object):
    """A class that will do matching and store configuration for the matching process"""

    sort_reverse = False

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
        """max_candidates is the maximum number of candidates that should be assembled,
        min_similarity is the minimum similarity that must be attained to be included in
        the result, comparer is an optional Comparer with similarity() function"""
        if comparer is None:
            comparer = lshtein.LevenshteinComparer(max_length)
        self.comparer = comparer
        self.setparameters(max_candidates, min_similarity, max_length)
        self.usefuzzy = usefuzzy
        self.inittm(store)
        self.addpercentage = True

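    # A minimal usage sketch (illustration only, not part of the original
    # module; the file name is hypothetical): build a matcher over an
    # existing PO store and query it for suggestions.
    #
    #     from translate.storage import po
    #     from translate.search import match
    #
    #     store = po.pofile(open("existing.po").read())
    #     tm = match.matcher(store, max_candidates=5, min_similarity=80)
    #     suggestions = tm.matches("Open the file")
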
    def usable(self, unit):
        """Returns whether this translation unit is usable for TM"""
        #TODO: We might want to consider more attributes, such as approved, reviewed, etc.
        source = unit.source
        target = unit.target
        if source and target and (self.usefuzzy or not unit.isfuzzy()):
            if len(source) < 2:
                return False
            if source in self.existingunits and self.existingunits[source] == target:
                return False
            else:
                self.existingunits[source] = target
            return True
        return False

    def inittm(self, stores, reverse=False):
        """Initialises the memory for later use. We use simple base units for
        speedup."""
        # reverse is deprecated - just use self.sort_reverse
        self.existingunits = {}
        self.candidates = base.TranslationStore()

        if isinstance(stores, base.TranslationStore):
            stores = [stores]
        for store in stores:
            self.extendtm(store.units, store=store, sort=False)
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
        # print "TM initialised with %d candidates (%d to %d characters long)" % \
        #        (len(self.candidates.units), len(self.candidates.units[0].source), len(self.candidates.units[-1].source))

    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).

        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort: Optional parameter that can be set to False to suppress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if isinstance(units, base.TranslationUnit):
            units = [units]
        candidates = filter(self.usable, units)
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # We need to ensure that we don't pass multistrings further, since
            # some modules (like the native Levenshtein) can't use it.
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # If we now only get translator comments, we don't get programmer
            # comments in TM suggestions (in Pootle, for example). If we get all
            # notes, pot2po adds all previous comments as translator comments
            # in the new po file
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            self.candidates.units.append(simpleunit)
        if sort:
            self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)

    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the tm. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length

    def getstoplength(self, min_similarity, text):
        """Calculates a length beyond which we are not interested.
        The extra fat is because we don't use plain character distance only."""
        return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)

    def getstartlength(self, min_similarity, text):
        """Calculates the minimum length we are interested in.
        The extra fat is because we don't use plain character distance only."""
        return max(len(text) * (min_similarity/100.0), 1)

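    # Worked example (illustrative): with min_similarity=75 and a 10-character
    # search text, getstartlength gives max(10 * 0.75, 1) = 7.5 and
    # getstoplength gives min(10 / 0.75, MAX_LENGTH) = 13.33..., so only
    # candidates whose source length falls within that band are compared at all.
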
    def matches(self, text):
        """Returns a list of possible matches for given source text.

        @type text: String
        @param text: The text that will be searched for in the translation memory
        @rtype: list
        @return: a list of units with the source and target strings from the
        translation memory. If self.addpercentage is true (default) the match
        quality is given as a percentage in the notes.
        """
        bestcandidates = [(0.0, None)]*self.MAX_CANDIDATES
        #We use self.MIN_SIMILARITY, but if we already know we have max_candidates
        #that are better, we can adjust min_similarity upwards for speedup
        min_similarity = self.MIN_SIMILARITY

        # We want to limit our search in self.candidates, so we want to ignore
        # all units with a source string that is too short or too long. We use
        # a binary search to find the shortest string, from where we start our
        # search in the candidates.

        # minimum source string length to be considered
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        endindex = len(self.candidates.units)
        while startindex < endindex:
            mid = (startindex + endindex) // 2
            if sourcelen(self.candidates.units[mid]) < startlength:
                startindex = mid + 1
            else:
                endindex = mid

        # maximum source string length to be considered
        stoplength = self.getstoplength(min_similarity, text)
        lowestscore = 0

        for candidate in self.candidates.units[startindex:]:
            cmpstring = candidate.source
            if len(cmpstring) > stoplength:
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            if similarity > lowestscore:
                heapq.heapreplace(bestcandidates, (similarity, candidate))
                lowestscore = bestcandidates[0][0]
                if lowestscore >= 100:
                    break
                if min_similarity < lowestscore:
                    min_similarity = lowestscore
                    stoplength = self.getstoplength(min_similarity, text)

        #Remove the empty ones:
        def notzero(item):
            score = item[0]
            return score != 0
        bestcandidates = filter(notzero, bestcandidates)
        #Sort for use as a general list, and reverse so the best one is at index 0
        bestcandidates.sort(reverse=True)
        return self.buildunits(bestcandidates)

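    # Sketch of consuming the results (values hypothetical): each returned
    # unit is a po.pounit, carrying the match quality in its notes when
    # self.addpercentage is True.
    #
    #     for unit in tm.matches("Open the file"):
    #         print unit.source, "->", unit.target, unit.getnotes()  # e.g. "85%"
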
    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment"""
        units = []
        for score, candidate in candidates:
            if hasattr(candidate, "orig_source"):
                candidate.source = candidate.orig_source
                candidate.target = candidate.orig_target
            newunit = po.pounit(candidate.source)
            newunit.target = candidate.target
            newunit.markfuzzy(candidate.fuzzy)
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units


# We don't want to miss certain forms of words that only change a little
# at the end. Now we are tying this code to English, but it should serve
# us well. For example "category" should be found in "categories",
# "copy" should be found in "copied"
#
# The tuples define a regular expression to search for, and with what it
# should be replaced.
ignorepatterns = [
    ("y\s*$", "ie"),    #category/categories, identify/identifies, apply/applied
    ("[\s-]+", ""),     #down time / downtime, pre-order / preorder
    ("-", " "),         #pre-order / pre order
    (" ", "-"),         #pre order / pre-order
]
ignorepatterns_re = [(re.compile(a), b) for (a, b) in ignorepatterns]

context_re = re.compile("\s+\(.*\)\s*$")

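# For example, the first pattern turns "apply" into "applie" (a prefix of
# "applied") and the second turns "pre-order" into "preorder", while
# context_re strips a trailing disambiguator such as the " (noun)" in
# "file (noun)".
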
class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""

    sort_reverse = True

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
        self.addpercentage = False
        self.match_info = {}

    def inittm(self, store):
        """Normal initialisation, but convert all source strings to lower case"""
        matcher.inittm(self, store)
        extras = []
        for unit in self.candidates.units:
            source = unit.source = context_re.sub("", unit.source).lower()
            for ignorepattern_re, replacement in ignorepatterns_re:
                (newterm, occurrences) = ignorepattern_re.subn(replacement, source)
                if occurrences:
                    new_unit = type(unit).buildfromunit(unit)
                    new_unit.source = newterm
                    # We mark it fuzzy to indicate that it isn't pristine
                    unit.markfuzzy()
                    extras.append(new_unit)
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
        if extras:
            # We don't sort, so that the altered forms are at the back and
            # considered last.
            self.extendtm(extras, sort=False)

    def getstartlength(self, min_similarity, text):
        # Let's reduce false matches by not working with terms of two
        # characters or less
        return 3

    def getstoplength(self, min_similarity, text):
        # Let's ignore terms with more than 50 characters. Perhaps someone
        # gave a file with normal (long) translations
        return 50

    def usable(self, unit):
        """Returns whether this translation unit is usable for terminology."""
        if not unit.istranslated():
            return False
        l = len(context_re.sub("", unit.source))
        return l <= self.MAX_LENGTH and l >= self.getstartlength(None, None)

    def matches(self, text):
        """Normal matching after converting text to lower case. Then replace
        with the original unit to retain comments, etc."""
        text = text.lower()
        comparer = self.comparer
        comparer.match_info = {}
        match_info = {}
        matches = []
        known = set()
        for cand in self.candidates.units:
            source = cand.source
            if (source, cand.target) in known:
                continue
            if comparer.similarity(text, source, self.MIN_SIMILARITY):
                match_info[source] = {'pos': comparer.match_info[source]['pos']}
                matches.append(cand)
                known.add((source, cand.target))

        final_matches = []
        lastend = 0
        _sort_matches(matches, match_info)
        for match in matches:
            start_pos = match_info[match.source]['pos']
            if start_pos < lastend:
                continue
            end = start_pos + len(match.source)

            final_matches.append(match)

            # Get translations for the placeable
            for m in matches:
                if m is match:
                    continue
                m_info = match_info[m.source]
                m_end = m_info['pos']
                if m_end > start_pos:
                    # we're past valid possibilities in the list
                    break
                m_end += len(m.source)
                if start_pos == m_info['pos'] and end == m_end:
                    # another match for the same term
                    final_matches.append(m)

            lastend = end
        if final_matches:
            self.match_info = match_info
        return final_matches

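    # A usage sketch (illustration only; the file name is hypothetical):
    # terminology matching finds known glossary terms inside a longer string.
    #
    #     glossary = po.pofile(open("terminology.po").read())
    #     termmatcher = terminologymatcher(glossary)
    #     for term in termmatcher.matches("Open the file manager"):
    #         print term.source, "->", term.target
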

# utility functions used by virtaal and tmserver to convert matching units
# into easily marshallable dictionaries
def unit2dict(unit):
    """converts a pounit to a simple dict structure for use over the web"""
    return {"source": unit.source, "target": unit.target,
            "quality": _parse_quality(unit.getnotes()), "context": unit.getcontext()}


def _parse_quality(comment):
    """extracts match quality from po comments"""
    quality = re.search('([0-9]+)%', comment)
    if quality:
        return quality.group(1)
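
# For example (values hypothetical), a matched unit whose notes contain
# "85%" would serialise as:
#
#     unit2dict(unit)
#     # {"source": u"Open file", "target": u"Ouvrir le fichier",
#     #  "quality": "85", "context": ""}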