mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-31 00:03:57 -04:00 
			
		
		
		
	unaccent: Make generate_unaccent_rules.py Python 3 compatible
Python 2 is still supported.

Author: Hugh Ranalli <hugh@whtc.ca>
Discussion: https://www.postgresql.org/message-id/CAAhbUMNyZ+PhNr_mQ=G161K0-hvbq13Tz2is9M3WK+yX9cQOCw@mail.gmail.com
This commit is contained in:
		
							parent
							
								
									d33faa285b
								
							
						
					
					
						commit
						3d59da9ccd
					
				| @ -1,4 +1,4 @@ | |||||||
| #!/usr/bin/python2 | #!/usr/bin/python | ||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
| # | # | ||||||
| # This script builds unaccent.rules on standard output when given the | # This script builds unaccent.rules on standard output when given the | ||||||
| @ -23,6 +23,24 @@ | |||||||
| # [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt | # [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt | ||||||
| # [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml | # [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml | ||||||
| 
 | 
 | ||||||
|  | # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped | ||||||
|  | # The approach is to be Python3 compatible with Python2 "backports". | ||||||
|  | from __future__ import print_function | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | import codecs | ||||||
|  | import sys | ||||||
|  | 
 | ||||||
|  | if sys.version_info[0] <= 2: | ||||||
|  |     # Encode stdout as UTF-8, so we can just print to it | ||||||
|  |     sys.stdout = codecs.getwriter('utf8')(sys.stdout) | ||||||
|  | 
 | ||||||
|  |     # Map Python 2's chr to unichr | ||||||
|  |     chr = unichr | ||||||
|  | 
 | ||||||
|  |     # Python 2 and 3 compatible bytes call | ||||||
|  |     def bytes(source, encoding='ascii', errors='strict'): | ||||||
|  |         return source.encode(encoding=encoding, errors=errors) | ||||||
|  | # END: Python 2/3 compatibility - remove when Python 2 compatibility dropped | ||||||
| 
 | 
 | ||||||
| import re | import re | ||||||
| import argparse | import argparse | ||||||
| @ -39,7 +57,7 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case | |||||||
|                        (0x0391, 0x03a9))     # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA |                        (0x0391, 0x03a9))     # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA | ||||||
| 
 | 
 | ||||||
| def print_record(codepoint, letter): | def print_record(codepoint, letter): | ||||||
|     print (unichr(codepoint) + "\t" + letter).encode("UTF-8") |     print (chr(codepoint) + "\t" + letter) | ||||||
| 
 | 
 | ||||||
| class Codepoint: | class Codepoint: | ||||||
|     def __init__(self, id, general_category, combining_ids): |     def __init__(self, id, general_category, combining_ids): | ||||||
| @ -116,7 +134,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath): | |||||||
|     charactersSet = set() |     charactersSet = set() | ||||||
| 
 | 
 | ||||||
|     # RegEx to parse rules |     # RegEx to parse rules | ||||||
|     rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;') |     rulePattern = re.compile(r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;') | ||||||
| 
 | 
 | ||||||
|     # construct tree from XML |     # construct tree from XML | ||||||
|     transliterationTree = ET.parse(latinAsciiFilePath) |     transliterationTree = ET.parse(latinAsciiFilePath) | ||||||
| @ -134,7 +152,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath): | |||||||
|         # Group 3: plain "trg" char. Empty if group 4 is not. |         # Group 3: plain "trg" char. Empty if group 4 is not. | ||||||
|         # Group 4: plain "trg" char between quotes. Empty if group 3 is not. |         # Group 4: plain "trg" char between quotes. Empty if group 3 is not. | ||||||
|         if matches is not None: |         if matches is not None: | ||||||
|             src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape') |             src = matches.group(1) if matches.group(1) is not None else bytes(matches.group(2), 'UTF-8').decode('unicode-escape') | ||||||
|             trg = matches.group(3) if matches.group(3) is not None else matches.group(4) |             trg = matches.group(3) if matches.group(3) is not None else matches.group(4) | ||||||
| 
 | 
 | ||||||
|             # "'" and """ are escaped |             # "'" and """ are escaped | ||||||
| @ -195,10 +213,10 @@ def main(args): | |||||||
|            len(codepoint.combining_ids) > 1: |            len(codepoint.combining_ids) > 1: | ||||||
|             if is_letter_with_marks(codepoint, table): |             if is_letter_with_marks(codepoint, table): | ||||||
|                 charactersSet.add((codepoint.id, |                 charactersSet.add((codepoint.id, | ||||||
|                              unichr(get_plain_letter(codepoint, table).id))) |                              chr(get_plain_letter(codepoint, table).id))) | ||||||
|             elif args.noLigaturesExpansion is False and is_ligature(codepoint, table): |             elif args.noLigaturesExpansion is False and is_ligature(codepoint, table): | ||||||
|                 charactersSet.add((codepoint.id, |                 charactersSet.add((codepoint.id, | ||||||
|                              "".join(unichr(combining_codepoint.id) |                              "".join(chr(combining_codepoint.id) | ||||||
|                                      for combining_codepoint \ |                                      for combining_codepoint \ | ||||||
|                                      in get_plain_letters(codepoint, table)))) |                                      in get_plain_letters(codepoint, table)))) | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user