Package base :: Package includes :: Module unicode
[hide private]

Source Code for Module base.includes.unicode

  1  #!/usr/bin/env python 
  2  # $Id: unicode.inc,v 1.31 2008/06/18 03:36:23 dries Exp $ 
  3   
  4  """ 
  5    Drupal Unicode helpers. 
  6   
  7    @package includes 
  8    @see <a href='http://drupy.net'>Drupy Homepage</a> 
  9    @see <a href='http://drupal.org'>Drupal Homepage</a> 
 10    @note Drupy is a port of the Drupal project. 
 11    @note This file was ported from Drupal's includes/unicode.inc 
 12    @author Brendon Crawford 
 13    @copyright 2008 Brendon Crawford 
 14    @contact message144 at users dot sourceforge dot net 
 15    @created 2008-01-10 
 16    @version 0.1 
 17    @note License: 
 18   
 19      This program is free software; you can redistribute it and/or 
 20      modify it under the terms of the GNU General Public License 
 21      as published by the Free Software Foundation; either version 2 
 22      of the License, or (at your option) any later version. 
 23   
 24      This program is distributed in the hope that it will be useful, 
 25      but WITHOUT ANY WARRANTY; without even the implied warranty of 
 26      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 27      GNU General Public License for more details. 
 28   
 29      You should have received a copy of the GNU General Public License 
 30      along with this program; if not, write to: 
 31       
 32      The Free Software Foundation, Inc., 
 33      51 Franklin Street, Fifth Floor, 
 34      Boston, MA  02110-1301, 
 35      USA 
 36  """ 
 37   
 38  __version__ = "$Revision: 1 $" 
 39   
 40  from lib.drupy import DrupyPHP as php 
 41  from xml.dom import minidom 
 42  import htmlentitydefs 
 43  import re 
 44  import appglobals as lib_appglobals 
 45   
 46   
 47  # 
 48  # Status code: the check for Unicode support failed. 
 49  # 
 50  UNICODE_ERROR = -1 
 51   
 52  # 
 53  # Status code: the emulated (standard PHP style) Unicode handling is in use. 
 54  # 
 55  UNICODE_SINGLEBYTE = 0 
 56   
 57  # 
 58  # Status code: full multibyte Unicode support is in use. This is the value 
 59  # that _unicode_check() in this module always reports. 
 60  # 
 61  UNICODE_MULTIBYTE = 1 
 62   
 63  """ 
 64   Wrapper around _unicode_check(). 
 65  """ 
def check():
    """
     Wrapper around _unicode_check().

     Runs the check and stores the first element of its result (the status
     code) in lib_appglobals.multibyte.
    """
    result = _unicode_check()
    lib_appglobals.multibyte = result[0]
def _unicode_check():
    """
     Report the Unicode support level.

     No probing is performed here: the result is always the
     UNICODE_MULTIBYTE status paired with an empty message string.

     @return
       A (status, message) tuple.
    """
    status = UNICODE_MULTIBYTE
    message = ''
    return (status, message)
def requirements():
    """
     Return Unicode library status and errors.

     NOTE(review): get_t and REQUIREMENT_OK are neither defined nor imported
     in the visible part of this module - confirm where they come from.

     @return
       A dict with a single 'unicode' entry holding the 'title', 'value',
       'description' and 'severity' of the requirement.
    """
    # Ensure translations don't break at install time
    translate = get_t()
    unicode_status = {
        'title' : translate('Unicode library'),
        'value' : translate('Builtin'),
        'description' : 'Builtin Python Unicode support',
        'severity' : REQUIREMENT_OK
    }
    return {'unicode' : unicode_status}
def drupal_xml_parser_create(data):
    """
     Parse XML data into a DOM document.

     The referenced data is decoded from UTF-8 into a unicode object first
     and written back to the reference, so callers should take into account
     that the data has been changed after the call.

     @param &data
       Reference object whose value (data._) holds the XML data.
       NOTE(review): php.Reference.check() presumably validates that a
       reference was passed - confirm against DrupyPHP.
     @return
       The document produced by minidom.parseString(), or False when the
       parser raises ExpatError or TypeError.
    """
    # Local import: the module itself never imports ExpatError.
    from xml.parsers.expat import ExpatError
    php.Reference.check(data)
    # Default XML encoding is UTF-8
    encoding = 'utf-8'
    data._ = unicode(data._, encoding)
    try:
        return minidom.parseString(data._)
    except (ExpatError, TypeError):
        # The tuple matters: "except ExpatError, TypeError" would catch only
        # ExpatError and bind the exception to the name TypeError.
        return False
def drupal_convert_to_utf8(data, encoding):
    """
     Convert data to UTF-8

     The data is decoded with the builtin unicode() constructor.

     @param data
       The data to be converted.
     @param encoding
       The encoding that the data is in
     @return
       The decoded unicode object.
       NOTE(review): earlier documentation promised "or False" on failure,
       but nothing here catches the errors unicode() may raise - confirm
       what callers expect.
    """
    converted = unicode(data, encoding)
    return converted
def drupal_truncate_bytes(string_, len_):
    """
     Truncate a UTF-8-encoded string safely to a number of bytes.

     If the end position is in the middle of a UTF-8 sequence, it scans
     backwards until the beginning of the byte sequence, so the incomplete
     character is dropped entirely.

     Use this function whenever you want to chop off a string at an unsure
     location. If you are sure that you are splitting on a character
     boundary, a plain slice is enough.

     @param string_
       The UTF-8 encoded byte string to truncate.
     @param len_
       An upper limit on the returned string length, in bytes.
     @return
       The truncated string (same type as string_). A limit of zero or less
       yields an empty string unless the input is already short enough.
    """
    if len(string_) <= len_:
        return string_
    if len_ <= 0:
        return string_[:0]
    # A one-element slice keeps ord() working for both str and bytes input.
    # Bytes in 0x80-0xBF are UTF-8 continuation bytes: cutting right before
    # one would split a character, so move the cut back to its lead byte.
    while len_ > 0 and 0x80 <= ord(string_[len_:len_ + 1]) < 0xC0:
        len_ -= 1
    return string_[:len_]
def truncate_utf8(string_, len_, wordsafe = False, dots = False):
    """
     Truncate a UTF-8-encoded string safely to a number of characters.

     @param string_
       The string to truncate.
     @param len_
       An upper limit on the returned string length.
     @param wordsafe
       Flag to truncate at last space within the upper limit. Defaults to
       False.
     @param dots
       Flag to add trailing dots. Defaults to False. The four characters of
       the ' ...' suffix are taken out of the limit.
     @return
       The truncated string.
    """
    if drupal_strlen(string_) <= len_:
        return string_
    if dots:
        # Reserve room for the ' ...' suffix
        len_ -= 4
    truncated = None
    if wordsafe:
        # leave one more character, so a space sitting exactly at the limit
        # still counts as a word boundary
        candidate = drupal_substr(string_, 0, len_ + 1)
        last_space = candidate.rfind(' ')
        # space exists AND is not on position 0 (rfind gives -1 if missing)
        if last_space > 0:
            truncated = candidate[:last_space]
    if truncated is None:
        truncated = drupal_substr(string_, 0, len_)
    if dots:
        truncated += ' ...'
    return truncated
def mime_header_encode(string_):
    """
     Encodes MIME/HTTP header values that contain non-ASCII, UTF-8 encoded
     characters.

     For example, mime_header_encode('test.txt')
     returns "=?UTF-8?B?dMOpc3QudHh0?=". (where the 'e' is acute)

     See http://www.rfc-editor.org/rfc/rfc2047.txt for more information.

     Notes:
     - Only encode strings that contain non-ASCII characters.
     - We progressively cut-off a chunk with drupal_truncate_bytes(). This
       is to ensure each chunk starts and ends on a character boundary.
     - Using a line feed as the chunk separator may cause problems on some
       systems and may have to be changed to CR LF or CR.

     @param string_
       The UTF-8 encoded header value.
     @return
       The value unchanged when it only holds printable ASCII, otherwise the
       encoded words joined by a line feed followed by a space.
    """
    import base64
    if not re.search(r'[^\x20-\x7E]', string_):
        return string_
    chunk_size = 47 # floor((75 - strlen("=?UTF-8?B??=")) * 0.75);
    encoded_words = []
    while len(string_) > 0:
        chunk = drupal_truncate_bytes(string_, chunk_size)
        if not chunk:
            # Malformed input made the safe cut empty; take the raw bytes so
            # the loop always makes progress.
            chunk = string_[:chunk_size]
        encoded_words.append('=?UTF-8?B?' + base64.b64encode(chunk) + '?=')
        string_ = string_[len(chunk):]
    return "\n ".join(encoded_words)
def mime_header_decode(header_):
    """
     Complement to mime_header_encode

     Runs two replacement passes over the header, both handing the matched
     encoded chunks to '_mime_header_decode'.

     @param header_
       The header value to decode.
     @return
       The result of the second replacement pass.
    """
    # First step: encoded chunks followed by
    # other encoded chunks (need to collapse whitespace)
    followed_chunks = \
        '/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=\s+(?==\?)/'
    # Second step: remaining chunks (do not collapse whitespace)
    remaining_chunks = '/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=/'
    collapsed = php.preg_replace_callback(
        followed_chunks, '_mime_header_decode', header_)
    return php.preg_replace_callback(
        remaining_chunks, '_mime_header_decode', collapsed)
def _mime_header_decode(matches):
    """
     Helper function to mime_header_decode

     @param matches
       Sequence of regexp groups:
       0: The complete encoded chunk
       1: Character set name
       2: Escaping method (Q or B)
       3: Encoded data
     @return
       The decoded data, converted with drupal_convert_to_utf8() when the
       character set is not UTF-8. A base64 chunk that cannot be decoded is
       returned unchanged (matches[0]).
    """
    import base64
    import quopri
    charset, method, payload = matches[1], matches[2], matches[3]
    if method == 'B':
        # Restore padding that was left off, then decode.
        padded = payload + '=' * (-len(payload) % 4)
        try:
            data = base64.b64decode(padded)
        except (TypeError, ValueError):
            # Python 2 raises TypeError, Python 3 binascii.Error (a
            # ValueError); leave the undecodable chunk as it was.
            return matches[0]
    else:
        # header=True applies the RFC 2047 "Q" rules: "_" stands for a space
        # and is mapped before the =XX escapes are decoded, so an encoded
        # "=5F" stays an underscore.
        data = quopri.decodestring(payload, header=True)
    if charset.lower() != 'utf-8':
        data = drupal_convert_to_utf8(data, charset)
    return data
def decode_entities(text, exclude = None):
    """
     Decode all HTML entities (including numerical ones) to regular
     characters.
     Double-escaped entities will only be decoded once
     ("&amp;lt;" becomes "&lt;", not "<").

     Entity names are matched case-sensitively ("&Eacute;" and "&eacute;"
     are different characters). Entities that cannot be decoded are left
     untouched.

     @param text
       The text to decode entities in.
     @param exclude
       A list of characters which should not be decoded. For example,
       ['<', '&', '"']. This affects both named and numerical entities.
       Defaults to no exclusions.
     @return
       The text with the entities replaced.

     DRUPY(BC): This function heavily modified
    """
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        # Python 3 moved the entity table to html.entities
        from html.entities import name2codepoint
    if exclude is None:
        exclude = []
    # The lookup table is cached on the function object between calls.
    table = getattr(decode_entities, 'table', None)
    if not table:
        table = dict(name2codepoint)
        decode_entities.table = table
    def _this_decode_entities(m):
        original, prefix, codepoint = m.groups()
        return _decode_entities(prefix, codepoint, original, table, exclude)
    # Use a regexp to select all entities in one pass, to avoid decoding
    # double-escaped entities twice.
    pat = re.compile('(&(#x?)?([A-Za-z0-9]+);)', re.I)
    return pat.sub(_this_decode_entities, text)


def _decode_entities(prefix, codepoint, original, table, exclude):
    """
     Helper function for decode_entities

     @param prefix
       '#x' (any case) for a hexadecimal reference, '#' for a decimal one,
       None for a named entity.
     @param codepoint
       The digits of a numerical reference, or the entity name.
     @param original
       The complete entity text; returned when nothing is decoded.
     @param table
       Dict mapping entity names to code points.
     @param exclude
       Characters which should not be decoded.
     @return
       The decoded character, or original when the entity is unknown,
       malformed, out of range or excluded.

     DRUPY(BC): This function heavily modified
    """
    try:
        to_char = unichr
    except NameError:
        # Python 3: chr() covers the whole Unicode range
        to_char = chr
    try:
        if prefix is not None:
            # Numeric: hexadecimal with an 'x' marker, decimal otherwise
            base = 16 if prefix.lower() == '#x' else 10
            c = to_char(int(codepoint, base))
        else:
            # Word
            c = to_char(table[codepoint])
    except (KeyError, ValueError, OverflowError):
        # Unknown name, non-numeric digits or a code point out of range
        return original
    # Exclusion
    if exclude and c in exclude:
        return original
    return c
def drupal_strlen(text):
    """
     Count the amount of characters in a UTF-8 string. This is less than or
     equal to the byte count.

     @param text
       The string to measure.
     @return
       The number of characters.
    """
    if lib_appglobals.multibyte == UNICODE_MULTIBYTE:
        # NOTE(review): assumes the php emulation layer exposes mb_strlen in
        # the same way as the mb_strtoupper/mb_substr used by sibling
        # functions - confirm against DrupyPHP.
        return php.mb_strlen(text)
    # Do not count UTF-8 continuation bytes.
    return len(re.sub(r'[\x80-\xBF]', '', text))
def drupal_strtoupper(text):
    """
     Uppercase a UTF-8 string.

     @param text
       The string to convert.
     @return
       The uppercased string.
    """
    if lib_appglobals.multibyte == UNICODE_MULTIBYTE:
        return php.mb_strtoupper(text)
    # Use C-locale for ASCII-only uppercase
    text = php.strtoupper(text)
    # Case flip Latin-1 accented letters. The helper defined in this module
    # is _caseflip (there is no _unicode_caseflip).
    return php.preg_replace_callback('/\xC3[\xA0-\xB6\xB8-\xBE]/', \
        _caseflip, text)
def drupal_strtolower(text):
    """
     Lowercase a UTF-8 string.

     @param text
       The string to convert.
     @return
       The lowercased string.
    """
    # NOTE(review): the php.* helpers used here mirror the ones
    # drupal_strtoupper calls (mb_strtoupper, strtoupper,
    # preg_replace_callback); confirm DrupyPHP provides the lowercase
    # counterparts.
    if lib_appglobals.multibyte == UNICODE_MULTIBYTE:
        return php.mb_strtolower(text)
    # Use C-locale for ASCII-only lowercase
    text = php.strtolower(text)
    # Case flip Latin-1 accented letters. The helper defined in this module
    # is _caseflip (there is no _unicode_caseflip).
    return php.preg_replace_callback(\
        '/\xC3[\x80-\x96\x98-\x9E]/', _caseflip, text)
def _caseflip(matches):
    """
     Helper function for case conversion of Latin-1.
     Used for flipping U+C0-U+DE to U+E0-U+FD and back.

     @param matches
       Sequence whose first item is the matched two-byte sequence.
     @return
       The first byte unchanged, followed by the second byte with bit 0x20
       toggled.
    """
    lead = matches[0][0]
    trail = matches[0][1]
    flipped = chr(ord(trail) ^ 32)
    return lead + flipped
def drupal_ucfirst(text):
    """
     Capitalize the first letter of a UTF-8 string.

     @param text
       The string to convert.
     @return
       The uppercased first character followed by the rest of the string.
    """
    # Note: there is no mbstring equivalent for this.
    first = drupal_strtoupper(drupal_substr(text, 0, 1))
    rest = drupal_substr(text, 1)
    return first + rest
def drupal_substr(text, start, length = None):
    """
     Cut off a piece of a string based on character indices and counts.
     Follows the same behavior as PHP's own substr() function.

     Note that for cutting off a string at a known character/substring
     location, a plain slice is safe and much faster.

     @param text
       The string to cut.
     @param start
       Index of the first character; a negative value counts from the end.
     @param length
       Number of characters to return; a negative value leaves that many
       characters off the end. None (the default) means up to the end.
     @return
       The requested piece of the string.
    """
    if lib_appglobals.multibyte == UNICODE_MULTIBYTE:
        if length is None:
            return php.mb_substr(text, start)
        return php.mb_substr(text, start, length)
    return _substr_utf8_bytes(text, start, length)


def _substr_utf8_bytes(text, start, length):
    """
     Byte-level fallback for drupal_substr() on UTF-8 encoded strings.

     Takes the same arguments as drupal_substr() and returns the piece as a
     slice of text.
    """
    size = len(text)
    # Byte offsets at which a character starts: every byte that is not a
    # UTF-8 continuation byte (0x80-0xBF). One-element slices keep ord()
    # working for both str and bytes input.
    starts = [i for i in range(size)
              if not (0x80 <= ord(text[i:i + 1]) < 0xC0)]
    count = len(starts)

    def _offset(char_index):
        # Translate a character index into a byte offset.
        if char_index <= 0:
            return 0
        if char_index >= count:
            return size
        return starts[char_index]

    if start < 0:
        # Counting from the end; clamp to the beginning of the string
        first = max(count + start, 0)
    else:
        first = min(start, count)
    if length is None:
        last = count
    elif length < 0:
        last = count + length
    else:
        last = min(first + length, count)
    if last <= first:
        return text[:0]
    return text[_offset(first):_offset(last)]