1
2
3
4 """
5 Drupal Unicode helpers.
6
7 @package includes
8 @see <a href='http://drupy.net'>Drupy Homepage</a>
9 @see <a href='http://drupal.org'>Drupal Homepage</a>
10 @note Drupy is a port of the Drupal project.
11 @note This file was ported from Drupal's includes/unicode.inc
12 @author Brendon Crawford
13 @copyright 2008 Brendon Crawford
14 @contact message144 at users dot sourceforge dot net
15 @created 2008-01-10
16 @version 0.1
17 @note License:
18
19 This program is free software; you can redistribute it and/or
20 modify it under the terms of the GNU General Public License
21 as published by the Free Software Foundation; either version 2
22 of the License, or (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program; if not, write to:
31
32 The Free Software Foundation, Inc.,
33 51 Franklin Street, Fifth Floor,
34 Boston, MA 02110-1301,
35 USA
36 """
37
38 __version__ = "$Revision: 1 $"
39
40 from lib.drupy import DrupyPHP as php
41 from xml.dom import minidom
42 import htmlentitydefs
43 import re
44 import appglobals as lib_appglobals
45
46
47
48
49
# Status codes describing which level of Unicode support is in use.

# NOTE(review): UNICODE_ERROR is not referenced in the visible part of this
# module; presumably it marks a failed Unicode support check -- confirm.
UNICODE_ERROR = -1

# Presumably the byte-level emulation mode: the string helpers in this module
# scan raw UTF-8 bytes whenever the active mode is not UNICODE_MULTIBYTE.
UNICODE_SINGLEBYTE = 0

# Multibyte mode: the string helpers in this module delegate to the mb_*
# functions when the active mode equals this value.
UNICODE_MULTIBYTE = 1
62
63 """
64 Wrapper around _unicode_check().
65 """
68
69
70
def _unicode_check(errors=False):
    """
    Report which level of Unicode support is available.

    The Drupal original probed PHP's mbstring configuration. This port
    performs no checks and always reports multibyte support.

    @param errors
      Unused here. In Drupal it controlled whether fatal errors were
      reported with form_set_error().
    @return
      A two-element tuple: the status code (always UNICODE_MULTIBYTE) and
      an empty string (presumably the error message slot -- confirm
      against callers).
    """
    return (UNICODE_MULTIBYTE, '')
84
85
def unicode_requirements():
    """
    Return Unicode library status and errors.

    NOTE(review): the function header was reconstructed; the name follows
    Drupal's includes/unicode.inc, from which this file was ported.

    @return
      A dict with a single 'unicode' entry holding the 'title', 'value',
      'description' and 'severity' of the requirement.
    """
    # NOTE(review): get_t and REQUIREMENT_OK are neither defined nor imported
    # in the visible part of this module -- confirm where they come from.
    t = get_t()
    return {
        'unicode': {
            'title': t('Unicode library'),
            'value': t('Builtin'),
            'description': 'Builtin Python Unicode support',
            'severity': REQUIREMENT_OK,
        },
    }
101
102
103
def drupal_xml_parser_create(data):
    """
    Decode XML data as UTF-8 and parse it into a DOM document.

    This replaces Drupal's wrapper around xml_parser_create(). Callers
    should take into account that the referenced data is modified: on
    success it holds the decoded (unicode) text after the call.

    NOTE(review): the function header was reconstructed; the name follows
    Drupal's includes/unicode.inc, from which this file was ported.

    @param &data
      A php.Reference whose `_` attribute holds the XML data to parse.
    @return
      The document returned by minidom.parseString(), or False when the
      data cannot be decoded or parsed.
    """
    # Imported locally: ExpatError is not among the module-level imports.
    from xml.parsers.expat import ExpatError

    php.Reference.check(data)

    encoding = 'utf-8'
    try:
        # Decoding an object that is already unicode raises TypeError, so
        # only byte strings are decoded.
        if not isinstance(data._, unicode):
            data._ = unicode(data._, encoding)
        return minidom.parseString(data._)
    except (ExpatError, TypeError, UnicodeError):
        # A tuple is required here: "except ExpatError, TypeError" would only
        # catch ExpatError and bind the exception to the name TypeError.
        return False
133
134
135
def drupal_convert_to_utf8(data, encoding):
    """
    Decode a byte string from the given encoding.

    Despite the name inherited from Drupal, the result is a decoded
    (unicode) string rather than UTF-8 encoded bytes.

    @param data
      The byte string to be converted.
    @param encoding
      The encoding that the data is in.
    @return
      The decoded string, or False when the encoding is unknown or the data
      is not valid in that encoding.
    """
    try:
        return data.decode(encoding)
    except (LookupError, UnicodeError):
        # LookupError: unknown encoding name; UnicodeError: undecodable data.
        return False
150
151
152
def drupal_truncate_bytes(string_, len_):
    """
    Truncate a UTF-8-encoded string safely to a number of bytes.

    If the end position is in the middle of a UTF-8 sequence, it scans
    backwards until the beginning of the byte sequence.

    Use this function whenever you want to chop off a string at an unsure
    location. On the other hand, if you're sure that you're splitting on a
    character boundary (e.g. after searching for a substring), plain slicing
    is safe.

    @param string_
      The UTF-8 encoded byte string to truncate.
    @param len_
      An upper limit on the returned string length, in bytes.
    @return
      The truncated string; empty when len_ is zero or negative.
    """
    if len(string_) <= len_:
        return string_
    if len_ <= 0:
        return string_[:0]
    # Bytes of the form 10xxxxxx (0x80-0xBF) continue a multi-byte sequence.
    # Back up until the cut falls just before a byte that starts a character.
    # One-element slices keep ord() valid for both str and bytes objects.
    while len_ > 0 and 0x80 <= ord(string_[len_:len_ + 1]) < 0xC0:
        len_ -= 1
    return string_[:len_]
182
183
def truncate_utf8(string_, len_, wordsafe=False, dots=False):
    """
    Truncate a UTF-8-encoded string safely to a number of characters.

    @param string_
      The string to truncate.
    @param len_
      An upper limit on the returned string length.
    @param wordsafe
      Flag to truncate at last space within the upper limit. Defaults to False.
    @param dots
      Flag to add trailing dots. Defaults to False.
    @return
      The truncated string.
    """
    if drupal_strlen(string_) <= len_:
        return string_
    if dots:
        # Reserve room for the ' ...' suffix appended below.
        len_ -= 4
    if wordsafe:
        # Take one extra character so that a space sitting exactly at the
        # limit is still found.
        string_ = drupal_substr(string_, 0, len_ + 1)
        last_space = string_.rfind(' ')
        # rfind() yields -1 when there is no space; a space at index 0 is
        # ignored too, since cutting there would leave nothing.
        if last_space > 0:
            string_ = string_[:last_space]
        else:
            string_ = drupal_substr(string_, 0, len_)
    else:
        string_ = drupal_substr(string_, 0, len_)
    if dots:
        string_ += ' ...'
    return string_
216
217
def mime_header_encode(string_):
    """
    Encodes MIME/HTTP header values that contain non-ASCII, UTF-8 encoded
    characters.

    For example, mime_header_encode('test.txt')
    returns "=?UTF-8?B?dMOpc3QudHh0?=". (where the 'e' is acute)

    See http://www.rfc-editor.org/rfc/rfc2047.txt for more information.

    Notes:
    - Only encode strings that contain non-ASCII characters.
    - We progressively cut-off a chunk with drupal_truncate_bytes(). This is
      to ensure each chunk starts and ends on a character boundary.
    - Using \\n as the chunk separator may cause problems on some systems and
      may have to be changed to \\r\\n or \\r.

    @param string_
      The header value, as a UTF-8 encoded byte string.
    @return
      The encoded header value, or the input unchanged when it contains only
      characters in the range 0x20-0x7E.
    """
    # Imported locally: base64 is not among the module-level imports.
    import base64

    if re.search(r'[^\x20-\x7E]', string_) is None:
        return string_

    # 47 raw bytes base64-encode to 64 characters, which together with the
    # 12 characters of "=?UTF-8?B?" and "?=" stays under 75 characters.
    chunk_size = 47
    words = []
    while string_:
        chunk = drupal_truncate_bytes(string_, chunk_size)
        if not chunk:
            # No character boundary within the limit (malformed UTF-8): cut
            # on the raw byte limit rather than looping forever.
            chunk = string_[:chunk_size]
        words.append('=?UTF-8?B?' + base64.b64encode(chunk) + '?=')
        string_ = string_[len(chunk):]
    return "\n ".join(words)
247
248
def mime_header_decode(header_):
    """
    Complement to mime_header_encode.

    @param header_
      The header value, possibly containing "=?charset?Q|B?text?="
      encoded words.
    @return
      The header with every encoded word replaced by the result of
      _mime_header_decode(), which is passed to the replace helper by name.
    """
    encoded_word = r'=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?='
    # First pass: an encoded word followed by whitespace and then another
    # encoded word. The whitespace is part of the match, so it is dropped
    # when the match is replaced by the decoded text.
    header_ = php.preg_replace_callback(
        '/' + encoded_word + r'\s+(?==\?)/', '_mime_header_decode', header_)
    # Second pass: every remaining encoded word.
    return php.preg_replace_callback(
        '/' + encoded_word + '/', '_mime_header_decode', header_)
261
262
263
265 """
266 Helper function to mime_header_decode
267 """
268
269
270
271
272 data = (base64_decode(matches[3]) if (matches[2] == 'B') else \
273 str_replace('_', ' ', quoted_printable_decode(matches[3])));
274 if (strtolower(matches[1]) != 'utf-8'):
275 data = drupal_convert_to_utf8(data, matches[1]);
276 return data;
277
278
279
def decode_entities(text, exclude=None):
    """
    Decode all HTML entities (including numerical ones) to regular characters.
    Double-escaped entities will only be decoded once
    ("&amp;lt;" becomes "&lt;", not "<").

    Entity names are case-sensitive ("&Aacute;" and "&aacute;" are different
    characters). Unknown names and malformed numeric references are left
    untouched.

    @param text
      The text to decode entities in.
    @param exclude
      A sequence of characters which should not be decoded. For example,
      ['<', '&', '"']. This affects both named and numerical entities.
    @return
      The text with entities decoded.

    DRUPY(BC): This function heavily modified
    """
    if exclude is None:
        exclude = ()
    table = htmlentitydefs.name2codepoint

    def _this_decode_entities(m):
        original, prefix, body = m.groups()
        if prefix is None and body not in table:
            return original
        try:
            return _decode_entities(prefix, body, original, table, exclude)
        except (ValueError, OverflowError):
            # Not a valid number, or outside the representable range.
            return original

    # Group 1: the whole entity; group 2: '#' or '#x' for numeric entities;
    # group 3: the entity name or number.
    pat = re.compile(r'(&(#x?)?([A-Za-z0-9]+);)', re.I)
    return pat.sub(_this_decode_entities, text)
306
307
309 """
310 Helper function for decode_entities
311
312 DRUPY(BC): This function heavily modified
313 """
314
315 if prefix != None:
316
317 if prefix.lower() == '#x':
318 c = unichr(int(codepoint, 16));
319
320 else:
321 c = unichr(int(codepoint));
322
323 else:
324 c = unichr(table[codepoint]);
325
326 if (c in exclude):
327 return original;
328 else:
329 return c;
330
331
332
333
def drupal_strlen(text):
    """
    Count the amount of characters in a UTF-8 string. This is less than or
    equal to the byte count.

    @param text
      The string to measure.
    @return
      The number of characters.
    """
    if lib_appglobals.multibyte == UNICODE_MULTIBYTE:
        return php.mb_strlen(text)
    # Bytes 0x80-0xBF only continue a multi-byte sequence; dropping them
    # leaves exactly one byte per character.
    return len(re.sub('[\x80-\xBF]', '', text))
344
345
def drupal_strtoupper(text):
    """
    Uppercase a UTF-8 string.

    NOTE(review): the function header was reconstructed; the name follows
    Drupal's includes/unicode.inc, from which this file was ported.

    @param text
      The string to convert.
    @return
      The uppercased string.
    """
    if lib_appglobals.multibyte == UNICODE_MULTIBYTE:
        return php.mb_strtoupper(text)
    text = php.strtoupper(text)
    # Flip the two-byte sequences C3 A0-B6 and C3 B8-BE (U+E0-U+F6 and
    # U+F8-U+FE; U+F7 is skipped) with _unicode_caseflip().
    text = php.preg_replace_callback(
        '/\xC3[\xA0-\xB6\xB8-\xBE]/', _unicode_caseflip, text)
    return text
359
360
361
362
def drupal_strtolower(text):
    """
    Lowercase a UTF-8 string.

    NOTE(review): the function header was reconstructed; the name follows
    Drupal's includes/unicode.inc, from which this file was ported.

    @param text
      The string to convert.
    @return
      The lowercased string.
    """
    if lib_appglobals.multibyte == UNICODE_MULTIBYTE:
        return php.mb_strtolower(text)
    text = php.strtolower(text)
    # Flip the two-byte sequences C3 80-96 and C3 98-9E (U+C0-U+D6 and
    # U+D8-U+DE; U+D7 is skipped) with _unicode_caseflip().
    text = php.preg_replace_callback(
        '/\xC3[\x80-\x96\x98-\x9E]/', _unicode_caseflip, text)
    return text
376
377
378
380 """
381 Helper function for case conversion of Latin-1.
382 Used for flipping U+C0-U+DE to U+E0-U+FD and back.
383 """
384 return matches[0][0] + chr(ord(matches[0][1]) ^ 32);
385
386
387
394
395
396
def drupal_substr(text, start, length=None):
    """
    Cut off a piece of a string based on character indices and counts.
    Follows the same behavior as PHP's own substr() function.

    Note that for cutting off a string at a known character/substring
    location, plain slicing is safe and much faster.

    @param text
      The UTF-8 encoded string to cut.
    @param start
      The character offset to start at; a negative value counts from the
      end of the string.
    @param length
      The number of characters to return. A negative value leaves that
      many characters off the end, None returns everything up to the end
      of the string and 0 returns an empty string.
    @return
      The requested piece of the string.
    """
    if lib_appglobals.multibyte == UNICODE_MULTIBYTE:
        if length is None:
            return php.mb_substr(text, start)
        return php.mb_substr(text, start, length)

    if length == 0:
        return text[:0]

    strlen_ = len(text)

    def _starts_char(pos):
        # A position past the end counts as a boundary, like the empty
        # read PHP performs there; indexing it directly would raise.
        if pos >= strlen_:
            return True
        c = ord(text[pos:pos + 1])
        return c < 0x80 or c >= 0xC0

    # Find the starting byte offset.
    pos = 0
    if start > 0:
        # Count all the continuation bytes from the start until we have
        # found `start` characters.
        pos = -1
        chars = -1
        while pos < strlen_ and chars < start:
            pos += 1
            if _starts_char(pos):
                chars += 1
    elif start < 0:
        # Count all the continuation bytes from the end until we have found
        # abs(start) characters.
        wanted = abs(start)
        pos = strlen_
        chars = 0
        while pos > 0 and chars < wanted:
            pos -= 1
            if _starts_char(pos):
                chars += 1
    istart = pos

    # Find the ending byte offset (inclusive).
    if length is None:
        pos = strlen_ - 1
    elif length > 0:
        # Advance to the first byte of the character after the requested
        # ones, then step back onto the last requested byte.
        pos = istart
        chars = 0
        while pos < strlen_ and chars < length:
            pos += 1
            if _starts_char(pos):
                chars += 1
        pos -= 1
    else:
        # Walk back from the end over abs(length) characters.
        wanted = abs(length)
        pos = strlen_ - 1
        chars = 0
        while pos >= 0 and chars < wanted:
            if _starts_char(pos):
                chars += 1
            pos -= 1
    iend = pos

    return text[istart:istart + max(0, iend - istart + 1)]
460