1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
|
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import re
from difflib import SequenceMatcher
from xml import sax
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from compare_locales.parser import DTDParser, PropertiesParser
class Checker(object):
'''Abstract class to implement checks per file type.
'''
pattern = None
@classmethod
def use(cls, file):
return cls.pattern.match(file.file)
def check(self, refEnt, l10nEnt):
'''Given the reference and localized Entities, performs checks.
This is a generator yielding tuples of
- "warning" or "error", depending on what should be reported,
- tuple of line, column info for the error within the string
- description string to be shown in the report
'''
if True:
raise NotImplementedError("Need to subclass")
yield ("error", (0, 0), "This is an example error", "example")
class PrintfException(Exception):
def __init__(self, msg, pos):
self.pos = pos
self.msg = msg
class PropertiesChecker(Checker):
'''Tests to run on .properties files.
'''
pattern = re.compile('.*\.properties$')
printf = re.compile(r'%(?P<good>%|'
r'(?:(?P<number>[1-9][0-9]*)\$)?'
r'(?P<width>\*|[0-9]+)?'
r'(?P<prec>\.(?:\*|[0-9]+)?)?'
r'(?P<spec>[duxXosScpfg]))?')
def check(self, refEnt, l10nEnt):
'''Test for the different variable formats.
'''
refValue, l10nValue = refEnt.val, l10nEnt.val
refSpecs = None
# check for PluralForm.jsm stuff, should have the docs in the
# comment
if 'Localization_and_Plurals' in refEnt.pre_comment:
# For plurals, common variable pattern is #1. Try that.
pats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)',
refValue))
if len(pats) == 0:
return
lpats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)',
l10nValue))
if pats - lpats:
yield ('warning', 0, 'not all variables used in l10n',
'plural')
return
if lpats - pats:
yield ('error', 0, 'unreplaced variables in l10n',
'plural')
return
return
# check for lost escapes
raw_val = l10nEnt.raw_val
for m in PropertiesParser.escape.finditer(raw_val):
if m.group('single') and \
m.group('single') not in PropertiesParser.known_escapes:
yield ('warning', m.start(),
'unknown escape sequence, \\' + m.group('single'),
'escape')
try:
refSpecs = self.getPrintfSpecs(refValue)
except PrintfException:
refSpecs = []
if refSpecs:
for t in self.checkPrintf(refSpecs, l10nValue):
yield t
return
def checkPrintf(self, refSpecs, l10nValue):
try:
l10nSpecs = self.getPrintfSpecs(l10nValue)
except PrintfException, e:
yield ('error', e.pos, e.msg, 'printf')
return
if refSpecs != l10nSpecs:
sm = SequenceMatcher()
sm.set_seqs(refSpecs, l10nSpecs)
msgs = []
warn = None
for action, i1, i2, j1, j2 in sm.get_opcodes():
if action == 'equal':
continue
if action == 'delete':
# missing argument in l10n
if i2 == len(refSpecs):
# trailing specs missing, that's just a warning
warn = ', '.join('trailing argument %d `%s` missing' %
(i+1, refSpecs[i])
for i in xrange(i1, i2))
else:
for i in xrange(i1, i2):
msgs.append('argument %d `%s` missing' %
(i+1, refSpecs[i]))
continue
if action == 'insert':
# obsolete argument in l10n
for i in xrange(j1, j2):
msgs.append('argument %d `%s` obsolete' %
(i+1, l10nSpecs[i]))
continue
if action == 'replace':
for i, j in zip(xrange(i1, i2), xrange(j1, j2)):
msgs.append('argument %d `%s` should be `%s`' %
(j+1, l10nSpecs[j], refSpecs[i]))
if msgs:
yield ('error', 0, ', '.join(msgs), 'printf')
if warn is not None:
yield ('warning', 0, warn, 'printf')
def getPrintfSpecs(self, val):
hasNumber = False
specs = []
for m in self.printf.finditer(val):
if m.group("good") is None:
# found just a '%', signal an error
raise PrintfException('Found single %', m.start())
if m.group("good") == '%':
# escaped %
continue
if ((hasNumber and m.group('number') is None) or
(not hasNumber and specs and
m.group('number') is not None)):
# mixed style, numbered and not
raise PrintfException('Mixed ordered and non-ordered args',
m.start())
hasNumber = m.group('number') is not None
if hasNumber:
pos = int(m.group('number')) - 1
ls = len(specs)
if pos >= ls:
# pad specs
nones = pos - ls
specs[ls:pos] = nones*[None]
specs.append(m.group('spec'))
else:
if specs[pos] is not None:
raise PrintfException('Double ordered argument %d' %
(pos+1),
m.start())
specs[pos] = m.group('spec')
else:
specs.append(m.group('spec'))
# check for missing args
if hasNumber and not all(specs):
raise PrintfException('Ordered argument missing', 0)
return specs
class DTDChecker(Checker):
"""Tests to run on DTD files.
Uses xml.sax for the heavy lifting of xml parsing.
The code tries to parse until it doesn't find any unresolved entities
anymore. If it finds one, it tries to grab the key, and adds an empty
<!ENTITY key ""> definition to the header.
Also checks for some CSS and number heuristics in the values.
"""
pattern = re.compile('.*\.dtd$')
eref = re.compile('&(%s);' % DTDParser.Name)
tmpl = '''<!DOCTYPE elem [%s]>
<elem>%s</elem>
'''
xmllist = set(('amp', 'lt', 'gt', 'apos', 'quot'))
def __init__(self, reference):
self.reference = reference
self.__known_entities = None
def known_entities(self, refValue):
if self.__known_entities is None and self.reference is not None:
self.__known_entities = set()
for ent in self.reference:
self.__known_entities.update(self.entities_for_value(ent.val))
return self.__known_entities if self.__known_entities is not None \
else self.entities_for_value(refValue)
def entities_for_value(self, value):
reflist = set(m.group(1).encode('utf-8')
for m in self.eref.finditer(value))
reflist -= self.xmllist
return reflist
# Setup for XML parser, with default and text-only content handler
class TextContent(sax.handler.ContentHandler):
textcontent = ''
def characters(self, content):
self.textcontent += content
defaulthandler = sax.handler.ContentHandler()
texthandler = TextContent()
numPattern = r'([0-9]+|[0-9]*\.[0-9]+)'
num = re.compile('^%s$' % numPattern)
lengthPattern = '%s(em|px|ch|cm|in)' % numPattern
length = re.compile('^%s$' % lengthPattern)
spec = re.compile(r'((?:min\-)?(?:width|height))\s*:\s*%s' %
lengthPattern)
style = re.compile(r'^%(spec)s\s*(;\s*%(spec)s\s*)*;?$' %
{'spec': spec.pattern})
processContent = None
def check(self, refEnt, l10nEnt):
"""Try to parse the refvalue inside a dummy element, and keep
track of entities that we need to define to make that work.
Return a checker that offers just those entities.
"""
refValue, l10nValue = refEnt.val, l10nEnt.val
# find entities the refValue references,
# reusing markup from DTDParser.
reflist = self.known_entities(refValue)
inContext = self.entities_for_value(refValue)
entities = ''.join('<!ENTITY %s "">' % s for s in sorted(reflist))
parser = sax.make_parser()
parser.setFeature(sax.handler.feature_external_ges, False)
parser.setContentHandler(self.defaulthandler)
try:
parser.parse(StringIO(self.tmpl %
(entities, refValue.encode('utf-8'))))
# also catch stray %
parser.parse(StringIO(self.tmpl %
(refEnt.all.encode('utf-8') + entities,
'&%s;' % refEnt.key.encode('utf-8'))))
except sax.SAXParseException, e:
yield ('warning',
(0, 0),
"can't parse en-US value", 'xmlparse')
# find entities the l10nValue references,
# reusing markup from DTDParser.
l10nlist = self.entities_for_value(l10nValue)
missing = sorted(l10nlist - reflist)
_entities = entities + ''.join('<!ENTITY %s "">' % s for s in missing)
if self.processContent is not None:
self.texthandler.textcontent = ''
parser.setContentHandler(self.texthandler)
try:
parser.parse(StringIO(self.tmpl % (_entities,
l10nValue.encode('utf-8'))))
# also catch stray %
# if this fails, we need to substract the entity definition
parser.setContentHandler(self.defaulthandler)
parser.parse(StringIO(self.tmpl % (
l10nEnt.all.encode('utf-8') + _entities,
'&%s;' % l10nEnt.key.encode('utf-8'))))
except sax.SAXParseException, e:
# xml parse error, yield error
# sometimes, the error is reported on our fake closing
# element, make that the end of the last line
lnr = e.getLineNumber() - 1
lines = l10nValue.splitlines()
if lnr > len(lines):
lnr = len(lines)
col = len(lines[lnr-1])
else:
col = e.getColumnNumber()
if lnr == 1:
# first line starts with <elem>, substract
col -= len("<elem>")
elif lnr == 0:
col -= len("<!DOCTYPE elem [") # first line is DOCTYPE
yield ('error', (lnr, col), ' '.join(e.args), 'xmlparse')
warntmpl = u'Referencing unknown entity `%s`'
if reflist:
if inContext:
elsewhere = reflist - inContext
warntmpl += ' (%s used in context' % \
', '.join(sorted(inContext))
if elsewhere:
warntmpl += ', %s known)' % ', '.join(sorted(elsewhere))
else:
warntmpl += ')'
else:
warntmpl += ' (%s known)' % ', '.join(sorted(reflist))
for key in missing:
yield ('warning', (0, 0), warntmpl % key.decode('utf-8'),
'xmlparse')
if inContext and l10nlist and l10nlist - inContext - set(missing):
mismatch = sorted(l10nlist - inContext - set(missing))
for key in mismatch:
yield ('warning', (0, 0),
'Entity %s referenced, but %s used in context' % (
key.decode('utf-8'),
', '.join(sorted(inContext))
), 'xmlparse')
# Number check
if self.num.match(refValue) and not self.num.match(l10nValue):
yield ('warning', 0, 'reference is a number', 'number')
# CSS checks
# just a length, width="100em"
if self.length.match(refValue) and not self.length.match(l10nValue):
yield ('error', 0, 'reference is a CSS length', 'css')
# real CSS spec, style="width:100px;"
if self.style.match(refValue):
if not self.style.match(l10nValue):
yield ('error', 0, 'reference is a CSS spec', 'css')
else:
# warn if different properties or units
refMap = dict((s, u) for s, _, u in
self.spec.findall(refValue))
msgs = []
for s, _, u in self.spec.findall(l10nValue):
if s not in refMap:
msgs.insert(0, '%s only in l10n' % s)
continue
else:
ru = refMap.pop(s)
if u != ru:
msgs.append("units for %s don't match "
"(%s != %s)" % (s, u, ru))
for s in refMap.iterkeys():
msgs.insert(0, '%s only in reference' % s)
if msgs:
yield ('warning', 0, ', '.join(msgs), 'css')
if self.processContent is not None:
for t in self.processContent(self.texthandler.textcontent):
yield t
class PrincessAndroid(DTDChecker):
"""Checker for the string values that Android puts into an XML container.
http://developer.android.com/guide/topics/resources/string-resource.html#FormattingAndStyling # noqa
has more info. Check for unescaped apostrophes and bad unicode escapes.
"""
quoted = re.compile("(?P<q>[\"']).*(?P=q)$")
def unicode_escape(self, str):
"""Helper method to try to decode all unicode escapes in a string.
This code uses the standard python decode for unicode-escape, but
that's somewhat tricky, as its input needs to be ascii. To get to
ascii, the unicode string gets converted to ascii with
backslashreplace, i.e., all non-ascii unicode chars get unicode
escaped. And then we try to roll all of that back.
Now, when that hits an error, that's from the original string, and we
need to search for the actual error position in the original string,
as the backslashreplace code changes string positions quite badly.
See also the last check in TestAndroid.test_android_dtd, with a
lengthy chinese string.
"""
val = str.encode('ascii', 'backslashreplace')
try:
val.decode('unicode-escape')
except UnicodeDecodeError, e:
args = list(e.args)
badstring = args[1][args[2]:args[3]]
i = len(args[1][:args[2]].decode('unicode-escape'))
args[2] = i
args[3] = i + len(badstring)
raise UnicodeDecodeError(*args)
@classmethod
def use(cls, file):
"""Use this Checker only for DTD files in embedding/android."""
return (file.module in ("embedding/android",
"mobile/android/base") and
cls.pattern.match(file.file))
def processContent(self, val):
"""Actual check code.
Check for unicode escapes and unescaped quotes and apostrophes,
if string's not quoted.
"""
# first, try to decode unicode escapes
try:
self.unicode_escape(val)
except UnicodeDecodeError, e:
yield ('error', e.args[2], e.args[4], 'android')
# check for unescaped single or double quotes.
# first, see if the complete string is single or double quoted,
# that changes the rules
m = self.quoted.match(val)
if m:
q = m.group('q')
offset = 0
val = val[1:-1] # strip quotes
else:
q = "[\"']"
offset = -1
stray_quot = re.compile(r"[\\\\]*(%s)" % q)
for m in stray_quot.finditer(val):
if len(m.group(0)) % 2:
# found an unescaped single or double quote, which message?
if m.group(1) == '"':
msg = u"Quotes in Android DTDs need escaping with \\\" "\
u"or \\u0022, or put string in apostrophes."
else:
msg = u"Apostrophes in Android DTDs need escaping with "\
u"\\' or \\u0027, or use \u2019, or put string in "\
u"quotes."
yield ('error', m.end(0)+offset, msg, 'android')
def getChecker(file, reference=None):
if PropertiesChecker.use(file):
return PropertiesChecker()
if PrincessAndroid.use(file):
return PrincessAndroid(reference)
if DTDChecker.use(file):
return DTDChecker(reference)
return None
|