1 '''
2 Defines classes and functions for parsing bodies of text to find words and
3 prepare them for output to the user.
4
5 The top-level functions in this module are optimized to build L{Word}s from
6 bodies of text containing more than a single L{Word}. A chunking scheme based
7 on the average length of words in the English language reduces the number of
8 calls to L{Word.append} and generally outperforms single character at a time
9 processing (at least for English text).
10
11 @var VOWELS: Vowels in the current locale, used to determine if a word can be spoken
12 @type VOWELS: string
13
14 @author: Peter Parente
15 @author: Larry Weiss
16 @organization: IBM Corporation
17 @copyright: Copyright (c) 2005, 2007 IBM Corporation
18 @license: The BSD License
19
20 All rights reserved. This program and the accompanying materials are made
21 available under the terms of the BSD license which accompanies
22 this distribution, and is available at
23 U{http://www.opensource.org/licenses/bsd-license.php}
24 '''
25 import unicodedata as ud
26 from AccessEngine import AEState
27 from AccessEngine.AEPor import AEPor
28 from AccessEngine.AEConstants import WORD_NON_BLANK, WORD_ALPHABETIC, WORD_ALPHA_NUMERIC, \
29 WORD_ALPHA_PUNCT, WORD_ALPHA_NUMERIC_PUNCT, WORD_LAST
30 from Tools.i18n import _
31
32
# Characters treated as vowels. The literal is passed through _() so that a
# translator can supply the vowel set for the current locale; the vowel test
# in this module is a plain membership check (`ch in VOWELS`).
VOWELS = _('AEIOUYaeiouy')
34
36 '''
37 Settings for L{Word} parsing. This class contains the set of all settings
38 that will be respected by the parser. Subclasses may override these settings
39
40 The following variables are not truly instance variables, but are proxied
41 by L{AEState.Setting} objects.
42
43 Caps (bool): When set to True, capitalization is preserved when presenting
44 text. Defaults to True.
45
46 MaxRepeat (integer): Specifies the minimum number of times a character must
47 be found in sequence before it is considered a repeat. Defaults to 4.
48
49 WordDef (enum): Set to NON_BLANK to define the main part of a word to only
50 include non-blank characters. Set to ALPHABETIC to define the main part
51 of a word to only include alphabetic characters. Set to ALPHA_NUMERIC to
52 define the main part of a word to only include alphabetic and numeric
53 characters. Set to ALPHA_NUMERIC_PUNCT to define the main part of a word
54 to only include characters that are alphabetic, numeric, or punctuation.
55 Defaults to NON_BLANK.
56
57 Ignore (string): Set to a string of characters that should be treated as if
58 they were blank. Defaults to the NBSP character.
59 '''
66
68 '''
69 Creates all settings for L{Word} parsing.
70 '''
71 self.newBool('Caps', True, _('Preserve caps?'),
72 _('When set, capitalization is preserved in the output. '
73 'Otherwise, all strings are lowercased.'))
74 self.newNumeric('MaxRepeat', 4, _('Maximum repeat'), 1, 100, 0,
75 _('Defines the maximum number of times a '
76 'character should be allowed to repeat without the word '
77 'being spelled.'))
78 self.newEnum('WordDef', WORD_NON_BLANK, _('Word definition'),
79 {_('Non-blank') : WORD_NON_BLANK,
80 _('Alphanumeric and punct.') : WORD_ALPHA_NUMERIC_PUNCT,
81 _('Alphanumeric') : WORD_ALPHA_NUMERIC,
82 _('Alpha and punct.') : WORD_ALPHA_PUNCT,
83 _('Alphabetic') : WORD_ALPHABETIC},
84 _('Defines the characters that comprise a word.'))
85 self.newString('Ignore', u'\u00a0', _('Ignored characters'),
86 _('Defines the characters to be treated as blanks.'))
87
89 '''
90 Dummy L{WordState} look-alike used by L{getContextFromString} as a default
91 set of settings when no state object is specified. Defines all the same
92 attributes as L{WordState}, but without instantiating unnecessary settings.
93 '''
94 Caps = True
95 MaxRepeat = 4
96 WordDef = WORD_NON_BLANK
97 Ignore = u'\u00a0'
98
100 '''
101 Gets the previous, current, and next L{Word}s relative to the given L{AEPor}.
102 If any word is missing, a None value is returned in its place. The string is
103 considered to be at the zero offset of the Item indicated by the L{AEPor}. Uses
104 a default L{WordState} object if none is provided.
105
106 @param string: Text to parse for words
107 @type string: string
108 @param state: Settings used to define a word
109 @type state: L{WordState}
110 @param por: Point of regard indicating the source accessible and Item for the
111 string
112 @return: Previous, current, and next words surrounding the L{AEPor}
113 @rtype: 3-tuple of L{Word}
114 '''
115 words = buildWordsFromString(string, por, state)
116 c_off = por.char_offset
117
118 if len(words) == 0:
119 return None, por, None
120 elif len(words) == 1:
121 return None, words[0], None
122
123 for i, w in enumerate(words):
124 w_off = w.getPOR().char_offset
125 if w_off > c_off:
126 if i == 1:
127
128 return None, words[0], words[1]
129 else:
130 return words[i-2], words[i-1], words[i]
131
132 return words[i-1], words[i], None
133
136 '''
137 Parses the given string to build a list of L{Word}s using the given state and
138 the given L{AEPor}. When no L{AEPor} is given a dummy POR is constructed. Each
139 L{Word} constructed will use the provided or constructed L{AEPor} to indicate
140 its position as if the string was from the same component and Item. The
141 character offset from the given or constructed L{AEPor} is not used. The string
142 is always considered to be at the zero offset of the Item indicated by the
143 L{AEPor}. Uses a default L{WordState} object if none is provided.
144
145 @param string: Text to parse for words
146 @type string: string
147 @param state: System settings used to define a word
148 @type state: L{WordState}
149 @param por: Point of regard indicating the source accessible and Item for the
150 string
151 @type por: L{AEPor}
152 @param main_ob: Function to invoke for each character in the main part of a
153 word
154 @type main_ob: callable
155 @param trail_ob: Function to invoke for each character in the trailing part
156 of a word
157 @type trail_ob: callable
158 @return: L{Word}s parsed from the string
159 @rtype: list of L{Word}
160 '''
161
162 count = 0
163
164 words = []
165 if por is None:
166
167 por = AEPor(None, None, 0)
168 else:
169
170 por = AEPor(por.accessible, por.item_offset, 0)
171
172 w = Word(state, por, main_ob, trail_ob)
173
174
175 chunks = (string[i:i+6] for i in xrange(0, len(string), 6))
176 for chunk in chunks:
177
178 while chunk is not None:
179
180 chunk = w.append(chunk)
181 if chunk is not None:
182
183 words.append(w)
184
185 count += w.getSourceLength()
186
187 w = Word(state, AEPor(por.accessible, por.item_offset, count),
188 main_ob, trail_ob)
189 words.append(w)
190 return words
191
193 '''
194 Represents a word in a body of text. Each L{Word} has a main and a trailing
195 part where the main part is processed according to other flags in the current
196 L{WordState} to improve its presentation to the user via a speech or other
197 output device while the trailing part remains unprocessed. The value of
198 WordDef determines what characters lie in the main and trailing parts of each
199 word. The following constants are available in L{AEConstants}.
200
201 - WORD_NON_BLANK: All non-blank characters are added to the main part
202 - WORD_ALPHABETIC: All characters considered letters in the current locale
203 are added to the main part
204 - WORD_ALPHA_NUMERIC: All characters considered letters and digits in the
205 current locale are added to the main part
206 - WORD_ALPHA_PUNCT: All characters considered letters and punctuation in
207 the current locale are added to the main part
208 - WORD_ALPHA_NUMERIC_PUNCT: All characters considered letters, digits, and
209 punctuation in the current locale are added to the main part
210
211 Characters in the ignore list are considered blank. A L{AEPor}
212 can be associated with a L{Word} to indicate its context in a larger body of
213 text.
214
215 Callables may be specified as observers for characters processed by the main
216 and trail parts of each L{Word}. An observer must take four parameters, this
217 L{Word} instance, the L{WordState} in use, the current character, and the
218 list of all characters in the main or trail part of the word. The observer
219 should return the character to be added. The list may be modified in place
220 to affect the final contents of the word.
221
222 @ivar state: Settings that determine the definition of a L{Word} and how
223 it is prepared for output
224 @type state: L{WordState}
225 @ivar por: Point of regard indicating where this L{Word} originated
226 @type por: L{AEPor}
227 @ivar source_word: Original text of this L{Word} without any preparation for
228 output applied
229 @type source_word: list
230 @ivar has_main: Has at least one main character been parsed?
231 @type has_main: boolean
232 @ivar main_part: Part of this L{Word} that will receive extra preparation for
233 output
234 @type main_part: list
235 @ivar trail_part: Part of the word that will receive little preparation for
236 output
237 @type trail_part: list
238 @ivar main_done: Is the L{main_part} complete?
239 @type main_done: boolean
240 @ivar trail_done: Is the L{trail_part} complete?
241 @type trail_done: boolean
242 @ivar more: Are there likely more L{Word}s after this one in the text source
243 where this L{Word} originated?
244 @type more: boolean
245 @ivar curr_repeat: Indicates a character should be considered a repeat iff
246 this value > MaxRepeat. It is not the exact number of repetitions of a
247 character as it is optimized for speed, not accuracy
248 @type curr_repeat: integer
249 @ivar last_char: Last character appended to this L{Word}
250 @type last_char: string
251 @ivar main_ob: Function to invoke for each character in the main part of a
252 word
253 @type main_ob: callable
254 @ivar trail_ob: Function to invoke for each character in the trailing part
255 of a word
256 @type trail_ob: callable
257 '''
def __init__(self, state, por, main_ob=None, trail_ob=None):
    '''
    Prepares an empty L{Word}: remembers the settings, point of regard and
    observers handed in by the caller and resets all parsing bookkeeping.

    @param state: Settings that define a L{Word} and how it is processed
    @type state: L{WordState}
    @param por: Point of regard indicating where this L{Word} originated
    @type por: L{AEPor}
    @param main_ob: Callable invoked for each character in the main part of
      the word, or None for no observer
    @type main_ob: callable
    @param trail_ob: Callable invoked for each character in the trailing part
      of the word, or None for no observer
    @type trail_ob: callable
    '''
    # collaborators supplied by the caller
    self.state, self.por = state, por
    self.main_ob, self.trail_ob = main_ob, trail_ob
    # character buffers: three distinct lists, all empty to begin with
    self.source_word, self.main_part, self.trail_part = [], [], []
    # parsing progress flags, nothing has been seen yet
    self.has_main = self.main_done = self.trail_done = self.more = False
    # repeat tracking state
    self.last_char = None
    self.curr_repeat = 0
286
288 '''
289 Compares this L{Word} to the one provided based on their L{AEPor}s and
290 content. If their L{source_word}s and L{AEPor}s are the same, they are
291 considered equal.
292
293 @param other: Word to compare
294 @type other: L{Word}
295 '''
296 return (self.por == other.por) and (self.source_word == other.source_word)
297
299 '''
300 Gets this L{Word} as a unicode string.
301
302 @return: Main part of the string joined with the trail
303 @rtype: string
304 '''
305 return u''.join(self.main_part+self.trail_part)
306
308 '''
309 Gets this L{Word} as a non-unicode string.
310
311 @return: Main part of the string joined with the trail
312 @rtype: string
313 '''
314 return ''.join(self.main_part+self.trail_part)
315
def _isMainChar(self, ch):
    '''
    Decides whether a character belongs to the main part of this word under
    the word definition currently selected in the state's WordDef setting.

    @param ch: Character to test
    @type ch: string
    @return: True-like value when the character counts as a main character,
      False for any unrecognized word definition
    @rtype: boolean
    '''
    word_def = self.state.WordDef
    if word_def == WORD_NON_BLANK:
        # everything that is neither whitespace nor an ignored character
        return not self.isBlank(ch)
    if word_def == WORD_ALPHABETIC:
        return self.isAlpha(ch)
    if word_def == WORD_ALPHA_NUMERIC:
        return self.isAlpha(ch) or self.isNumeric(ch)
    if word_def == WORD_ALPHA_PUNCT:
        return self.isAlpha(ch) or self.isPunctuation(ch)
    if word_def == WORD_ALPHA_NUMERIC_PUNCT:
        return (self.isAlpha(ch) or self.isNumeric(ch) or
                self.isPunctuation(ch))
    # unknown definition: nothing qualifies as a main character
    return False
337
def replaceMain(self, text):
    '''
    Replaces the main part of the word with the given text.

    The replacement is stored as a list of its items so that L{main_part}
    keeps the list type the rest of the class relies on.

    @param text: Text to use as the main part of the word
    @type text: string or list of strings
    '''
    # main_part is concatenated with the trail_part list when the word is
    # rendered and is appended to while parsing; keeping a bare string here
    # would make both operations fail, so normalize to a list.
    self.main_part = list(text)
346
348 '''
349 Replaces the main part of the word with the given string.
350
351 @param text: Text to use as the main part of the word
352 @type text: string
353 '''
354 self.main_part = text
355
356
358 '''
359 Gets the L{AEPor} associated with the start of this L{Word}.
360
361 @return: Point of regard pointing to the start of this word
362 @rtype: L{AEPor}
363 '''
364 return self.por
365
367 '''
368 Determines if the given character is blank or ignored.
369
370 @param ch: Character to test
371 @type ch: string
372 @return: Is the character a blank?
373 @rtype: boolean
374 '''
375 return ch.isspace() or ch in self.state.Ignore
376
378 '''
379 Determines if the given character is a letter in the current locale.
380
381 @param ch: Character to test
382 @type ch: string
383 @return: Is the character a letter?
384 @rtype: boolean
385 '''
386 return ch.isalpha()
387
389 '''
390 Determines if the given character is a number in the current locale.
391
392 @param ch: Character to test
393 @type ch: string
394 @return: Is the character a number?
395 @rtype: boolean
396 '''
397 return ch.isdigit()
398
400 '''
401 Determines if the given character is a punctuation mark.
402
403 @param ch: Character to test
404 @type ch: string
405 @return: Is the character a punctuation mark?
406 @rtype: boolean
407 '''
408 cat = ud.category(unicode(ch))
409 return (cat == 'Lm' or cat[0] in ['M', 'P', 'S'])
410
412 '''
413 Determines if the given character is a symbol.
414
415 @param ch: Character to test
416 @type ch: string
417 @return: Is the character a symbol?
418 @rtype: boolean
419 '''
420 return ud.category(unicode(ch)).startswith('C')
421
423 '''
424 Determines if the given character is a vowel. Relies on a translator to
425 list all vowels in the current locale.
426
427 @param ch: Character to test
428 @type ch: string
429 @return: Is the character a Latin vowel?
430 @rtype: boolean
431 '''
432 return ch in VOWELS
433
435 '''
436 Determines if the given character is an upper case letter.
437
438 @param ch: Character to test
439 @type ch: string
440 @return: Is the character capitalized?
441 @rtype: boolean
442 '''
443 return ch.isupper()
444
446 '''
447 Gets the unicode hex value for a character sans the 0x prefix.
448
449 @param ch: Single character
450 @type ch: string
451 @return: Hex value of the character
452 @rtype: string
453 '''
454 return hex(ord(ch))[2:]
455
457 '''
458 Gets the unicode name of the character, one of the strings listed in the
459 U{http://unicode.org/charts/charindex.html}. If the character could not be
460 determined from the given string, returns an empty string. Note that these
461 names are not localized.
462
463 @param ch: Single character
464 @type ch: string
465 @return: Name of the character
466 @rtype: string
467 '''
468 try:
469 return ud.name(unicode(ch)).lower()
470 except Exception:
471 return ''
472
474 '''
475 Gets a localized description of the given character. The most detailed
476 description for a character is returned so that, for instance, 'e' is
477 described as a vowel and not just a letter.
478
479 @param ch: Character to test
480 @type ch: string
481 @return: Localized description of the character according to the processing
482 done by this L{Word} class and based on the current state
483 @rtype: string
484 '''
485 if ch in self.state.Ignore:
486 return _('ignored')
487 elif self.isBlank(ch):
488 return _('blank')
489 elif self.isAlpha(ch):
490 if self.isCap(ch):
491 return _('capital')
492 elif self.isVowel(ch):
493 return _('vowel')
494 else:
495 return _('letter')
496 elif self.isNumeric(ch):
497 return _('number')
498 elif self.isPunctuation(ch):
499 return _('punctuation')
500 elif self.isSymbol(ch):
501 return _('symbol')
502
504 '''
505 Gets the unprocessed text of this word as it was seen in the original text
506 source.
507
508 @return: Parsed word without any processing applied
509 @rtype: string
510 '''
511 return self.source_word
512
514 '''
515 Gets the length of the unprocessed source text of this L{Word}.
516
517 @return: Length of the L{source_word}
518 @rtype: integer
519 '''
520 return len(self.source_word)
521
def getMainLength(self):
    '''
    Reports how many entries the processed main part of this L{Word} holds.

    @return: Length of the L{main_part}
    @rtype: integer
    '''
    main = self.main_part
    return len(main)
530
532 '''
533 Makes a guess as to whether or not there are more L{Word}s in the body of
534 text from which this word originated. This guess is based on whether or not
535 the last chunk passed to L{append} was processed in full.
536
537 @return: Are there likely more L{Word}s in the original body of text
538 @rtype: boolean
539 '''
540 return self.more
541
543 '''
544 Gets if this L{Word} has a character repeated more than the maximum number
545 of repetitions allowed or not.
546
547 @return: Does this L{Word} contain a repeated character?
548 @rtype: boolean
549 '''
550 if self.curr_repeat > self.state.MaxRepeat and not self.isAllNumeric():
551 return True
552 return False
553
555 '''
556 Gets if this L{Word} contains an uppercase letter or not.
557
558 @return: Does this L{Word} contain a capital letter?
559 @rtype: boolean
560 '''
561 for ch in self.source_word:
562 if self.isCap(ch):
563 return True
564 return False
565
567 '''
568 Gets if this L{Word} contains a vowel or not.
569
570 @return: Does this L{Word} contain a vowel?
571 @rtype: boolean
572 '''
573 for ch in self.source_word:
574 if self.isVowel(ch):
575 return True
576 return False
577
579 '''
580 Gets if this L{Word} is all capitals or not.
581
582 @return: Is this L{Word} all capital letters?
583 @rtype: boolean
584 '''
585 return self.isCap(self.source_word)
586
588 '''
589 Gets if this L{Word} is all numbers or not.
590
591 @return: Is this L{Word} all numbers?
592 @rtype: boolean
593 '''
594 return self.isNumeric(self.source_word)
595
597 '''
598 Gets if this L{Word} is all blanks or not.
599
600 @return: Is this L{Word} all blanks?
601 @rtype: boolean
602 '''
603 for ch in self.source_word:
604 if not self.isBlank(ch):
605 return False
606 return True
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
639 '''
640 Parses the given chunk of text for characters that should be added to the
641 L{main_part} or L{trail_part} of this L{Word}. If this word has neither
642 L{main_done} or L{trail_done} set, then all main characters determined by
643 L{_isMainChar} up to the first non-main character are added to the main
644 part of this word. When the first non-main character is encountered,
645 L{main_done} is set. If this word has L{main_done} set and L{trail_done}
646 unset, all non-main characters are added to the trail part of this word.
647 When another main character is encountered after L{main_done} is set,
648 L{trail_done} is set and the remainder of the given chunk is returned
649 unprocessed to be added to another L{Word}. Once L{trail_done} is set, no
650 further text can be appended to this L{Word}.
651
652 @param chunk: Chunk of text to parse for words
653 @type chunk: string
654 @return: Unprocessed portion of the chunk or None if fully processed
655 @rtype: string or None
656 @see: L{_processMain}
657 @see: L{_processTrail}
658 '''
659 if self.trail_done:
660
661 return chunk
662 for i, ch in enumerate(chunk):
663 mc = self._isMainChar(ch)
664 if mc:
665 if self.main_done:
666
667 self.trail_done = True
668 self.more = True
669 return chunk[i:]
670 else:
671 self.has_main = True
672
673 ch = self._processMain(ch)
674 else:
675
676 ch = self._processTrail(ch)
677 self.main_done = True
678
679
680
681
682 if ch == self.last_char and ch != '.':
683 self.curr_repeat += 1
684 elif self.curr_repeat < self.state.MaxRepeat:
685 self.last_char = ch
686 self.curr_repeat = 1
687 return None
688
def _processMain(self, ch):
    '''
    Records the given character in the L{source_word} exactly as received,
    then prepares it for the main part: a character found in the Ignore
    setting becomes a space, the character is lowercased when Caps is unset,
    and the main observer, if any, gets the final say. The result is added
    to the L{main_part}.

    @param ch: Character to process
    @type ch: string
    @return: Character inserted in L{main_part}
    @rtype: string
    '''
    settings = self.state
    # the source keeps the untouched character
    self.source_word.append(ch)
    prepared = u' ' if ch in settings.Ignore else ch
    if not settings.Caps:
        prepared = prepared.lower()
    observer = self.main_ob
    if observer:
        # the observer may rewrite the character or edit main_part in place
        prepared = observer(self, settings, prepared, self.main_part)
    self.main_part.append(prepared)
    return prepared
710
712 '''
713 Adds the given character to the L{source_word}. If the character is a
714 blank, inserts a space in L{trail_part}, else inserts the character.
715
716 @param ch: Character to process
717 @type ch: string
718 @return: Character inserted in L{trail_part}
719 @rtype: string
720 '''
721 if ch in self.state.Ignore:
722 ch = u' '
723 self.source_word.append(ch)
724 if self.trail_ob:
725 ch = self.trail_ob(self, self.state, ch, self.trail_part)
726 self.trail_part.append(ch)
727 return ch
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754