diff --git a/run_all.sh b/run_all.sh
index 24c5e632..9ade846d 100755
--- a/run_all.sh
+++ b/run_all.sh
@@ -33,14 +33,18 @@ rm -f $logmd
 #    |   mode    |LetterCase| shape_path  | shape filename
 # == 10m ================= |=========== |==========| ============| ================================================
 function run10m {
-./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_sovereignty
-./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries
+./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_sovereignty # this and the other admin_0 themes run, but Mapshaper overwrites them
+./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries # instead, results are copied into the housekeeping file's lookup table
 ./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries_lakes
 ./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_map_units
 ./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_map_subunits
 ./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_disputed_areas
-./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces
+./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces # this and the other admin_1 themes run, but Mapshaper overwrites them
 ./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces_lakes
+./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_label_points_details # Mapshaper uses this to generate admin_1 polys
+./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_2_label_points_details # Mapshaper uses this to generate admin_2 polys
+./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_2_counties # this and the other admin_2 themes run, but Mapshaper overwrites them
+./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_2_counties_lakes
 ./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_airports
 ./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_populated_places # this should be built before the derived Makefile themes run
 ./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_geographic_lines # this should be built before the derived Makefile themes run
@@ -57,7 +61,6 @@ function run10m {
 ./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_lake_centerlines_scale_rank
 ./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_europe
 ./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_north_america
-#./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_label_points_details # geometry errors
 }
 
 function run50m {
diff --git a/tools/wikidata/Makefile b/tools/wikidata/Makefile
index 92a74514..2a5439ff 100644
--- a/tools/wikidata/Makefile
+++ b/tools/wikidata/Makefile
@@ -6,6 +6,7 @@ install:
 	pip3 install -U SPARQLWrapper
 	pip3 install -U fiona
 	pip3 install -U csvtomd
+	pip3 install -U hanzidentifier
 
 clean:
 	cd ../.. && rm -rf temp_shape/*
diff --git a/tools/wikidata/README.md b/tools/wikidata/README.md
index 616094e8..e771af74 100644
--- a/tools/wikidata/README.md
+++ b/tools/wikidata/README.md
@@ -6,6 +6,7 @@
     pip3 install -U fiona
     pip3 install -U csvtomd
     pip3 install -U requests
+    pip3 install -U hanzidentifier
 
     #run from the project root ( expected 30-40 minutes )
     # be careful this is running 'make all'
@@ -357,34 +358,36 @@ lettercase = lowercase variable names [wikidataid, name_ar, name_bn, name_de, na
 
 see the _latest_ information in the `./run_all.sh`
 
-### supported languages ( now: 24)
+### supported languages ( now: 26)
 
-variable name | language     | language wikipedia link
---------------|--------------|----------------------------------------------------
- NAME_AR      | Arabic       | https://en.wikipedia.org/wiki/Arabic
- NAME_BN      | Bengali      | https://en.wikipedia.org/wiki/Bengali_language
- NAME_DE      | German       | https://en.wikipedia.org/wiki/German_language
- NAME_EN      | English      | https://en.wikipedia.org/wiki/English_language
- NAME_ES      | Spanish      | https://en.wikipedia.org/wiki/Spanish_language
- NAME_FR      | French       | https://en.wikipedia.org/wiki/French_language
- NAME_EL      | Modern Greek | https://en.wikipedia.org/wiki/Modern_Greek
- NAME_HE      | Hebrew       | https://en.wikipedia.org/wiki/Hebrew_language
- NAME_HI      | Hindi        | https://en.wikipedia.org/wiki/Hindi
- NAME_HU      | Hungarian    | https://en.wikipedia.org/wiki/Hungarian_language
- NAME_ID      | Indonesian   | https://en.wikipedia.org/wiki/Indonesian_language
- NAME_IT      | Italian      | https://en.wikipedia.org/wiki/Italian_language
- NAME_JA      | Japanese     | https://en.wikipedia.org/wiki/Japanese_language
- NAME_KO      | Korean       | https://en.wikipedia.org/wiki/Korean_language
- NAME_NL      | Dutch        | https://en.wikipedia.org/wiki/Dutch_language
- NAME_PL      | Polish       | https://en.wikipedia.org/wiki/Polish_language
- NAME_PT      | Portuguese   | https://en.wikipedia.org/wiki/Portuguese_language
- NAME_RU      | Russian      | https://en.wikipedia.org/wiki/Russian_language
- NAME_SV      | Swedish      | https://en.wikipedia.org/wiki/Swedish_language
- NAME_TR      | Turkish      | https://en.wikipedia.org/wiki/Turkish_language
- NAME_UK      | Ukrainian    | https://en.wikipedia.org/wiki/Ukrainian_language
- NAME_UR      | Urdu         | https://en.wikipedia.org/wiki/Urdu
- NAME_VI      | Vietnamese   | https://en.wikipedia.org/wiki/Vietnamese_language
- NAME_ZH      | Chinese      | https://en.wikipedia.org/wiki/Chinese_language
+variable name | language              | language wikipedia link
+--------------|-----------------------|----------------------------------------------------
+ NAME_AR      | Arabic                | https://en.wikipedia.org/wiki/Arabic
+ NAME_BN      | Bengali               | https://en.wikipedia.org/wiki/Bengali_language
+ NAME_DE      | German                | https://en.wikipedia.org/wiki/German_language
+ NAME_EN      | English               | https://en.wikipedia.org/wiki/English_language
+ NAME_EL      | Greek (modern)        | https://en.wikipedia.org/wiki/Modern_Greek
+ NAME_ES      | Spanish               | https://en.wikipedia.org/wiki/Spanish_language
+ NAME_FA      | Farsi                 | https://en.wikipedia.org/wiki/Persian_language
+ NAME_FR      | French                | https://en.wikipedia.org/wiki/French_language
+ NAME_HE      | Hebrew                | https://en.wikipedia.org/wiki/Hebrew_language
+ NAME_HI      | Hindi                 | https://en.wikipedia.org/wiki/Hindi
+ NAME_HU      | Hungarian             | https://en.wikipedia.org/wiki/Hungarian_language
+ NAME_ID      | Indonesian            | https://en.wikipedia.org/wiki/Indonesian_language
+ NAME_IT      | Italian               | https://en.wikipedia.org/wiki/Italian_language
+ NAME_JA      | Japanese              | https://en.wikipedia.org/wiki/Japanese_language
+ NAME_KO      | Korean                | https://en.wikipedia.org/wiki/Korean_language
+ NAME_NL      | Dutch                 | https://en.wikipedia.org/wiki/Dutch_language
+ NAME_PL      | Polish                | https://en.wikipedia.org/wiki/Polish_language
+ NAME_PT      | Portuguese            | https://en.wikipedia.org/wiki/Portuguese_language
+ NAME_RU      | Russian               | https://en.wikipedia.org/wiki/Russian_language
+ NAME_SV      | Swedish               | https://en.wikipedia.org/wiki/Swedish_language
+ NAME_TR      | Turkish               | https://en.wikipedia.org/wiki/Turkish_language
+ NAME_UK      | Ukrainian             | https://en.wikipedia.org/wiki/Ukrainian_language
+ NAME_UR      | Urdu                  | https://en.wikipedia.org/wiki/Urdu
+ NAME_VI      | Vietnamese            | https://en.wikipedia.org/wiki/Vietnamese_language
+ NAME_ZH      | Chinese (simplified)  | https://en.wikipedia.org/wiki/Chinese_language
+ NAME_ZHT     | Chinese (traditional) | https://en.wikipedia.org/wiki/Traditional_Chinese_characters
 
 # Name cleaning
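The `hanzidentifier` dependency added above (Makefile and README) is what lets the fetch script tell Simplified and Traditional Chinese labels apart. A quick, illustrative sanity check of the two calls the script relies on (the place-name strings here are arbitrary examples, not from the patch):

    import hanzidentifier

    hanzidentifier.is_simplified('旧金山')   # True  -- Simplified form of "San Francisco"
    hanzidentifier.is_traditional('舊金山')  # True  -- Traditional form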
diff --git a/tools/wikidata/fetch_wikidata.py b/tools/wikidata/fetch_wikidata.py
index fde8435a..c22d5379 100755
--- a/tools/wikidata/fetch_wikidata.py
+++ b/tools/wikidata/fetch_wikidata.py
@@ -2,6 +2,7 @@
 #-- pip3 install -U SPARQLWrapper
 #-- pip3 install -U fiona
+#-- pip3 install -U hanzidentifier
 
 """
 Fetch Wikidata Labels
@@ -20,6 +21,7 @@ import argparse
 import csv
 import sys
 import time
+import hanzidentifier
 #import requests
 
 from SPARQLWrapper import SPARQLWrapper, JSON, SPARQLExceptions
@@ -31,7 +33,7 @@ parser.add_argument('-input_shape_name',
                     default='../../10m_cultural/ne_10m_populated_places.shp',
                     help='input natural-earth shape file - with wikidataid columns')
 parser.add_argument('-input_lettercase',
-                    default='lowercase',
+                    default='uppercase',
                     help='variables in the shape file - lowercase or uppercase')
 parser.add_argument('-output_csv_name',
                     default='ne_10m_populated_places.csv',
@@ -67,13 +69,75 @@ def get_sparql_numvalue(sresult, variable_id):
         val = float(sresult[variable_id]['value'])
     return val
 
+def post_process_wd_zh(properties):
+    """ First check whether name_zhs (Simplified) and name_zht (Traditional)
+        are set already; if not, use the name_zh default to backfill them.
+        During the backfill, if there is no Simplified Chinese, Traditional
+        Chinese will be used as a further fallback, and vice versa.
+    """
+
+    name_en_default = properties['name_en'] if 'name_en' in \
+        properties else u''
+    zh_Hans_fallback = properties['name_zh_hans'] if 'name_zh_hans' in \
+        properties else u''
+    zh_Hant_fallback = properties['name_zh_hant'] if 'name_zh_hant' in \
+        properties else u''
+
+    # sometimes the default Chinese name has several slash-separated values
+    if 'name_zh_default' in properties:
+        names = properties['name_zh_default'].split('/')
+        for name in names:
+            if hanzidentifier.is_simplified(name) and \
+                    len(zh_Hans_fallback) == 0:
+                zh_Hans_fallback = name
+                #print('found simplified name')
+            if hanzidentifier.is_traditional(name) and \
+                    len(zh_Hant_fallback) == 0:
+                zh_Hant_fallback = name
+                #print('found traditional name')
+
+    # make sure we don't shove English values into the Chinese namespace
+    if (zh_Hans_fallback == name_en_default) and len(name_en_default) > 0:
+        zh_Hans_fallback = u''
+
+    if (zh_Hant_fallback == name_en_default) and len(name_en_default) > 0:
+        zh_Hant_fallback = u''
+
+    # now make the simplified and traditional Chinese name assignments
+    if 'name_zhs' not in properties:
+        if len(zh_Hans_fallback) != 0:
+            properties['name_zhs'] = zh_Hans_fallback
+        elif len(zh_Hant_fallback) != 0:
+            properties['name_zhs'] = zh_Hant_fallback
+        else:
+            properties['name_zhs'] = u''
+
+    if 'name_zht' not in properties:
+        if len(zh_Hant_fallback) != 0:
+            properties['name_zht'] = zh_Hant_fallback
+        elif len(zh_Hans_fallback) != 0:
+            properties['name_zht'] = zh_Hans_fallback
+        else:
+            properties['name_zht'] = u''
+
+    # only keep one of the options if the field is "/"-separated --
+    # for example "旧金山市县/三藩市市縣/舊金山市郡" keeps only the first
+    # value, 旧金山市县
+    if len(properties['name_zhs']) != 0:
+        properties['name_zhs'] = properties['name_zhs'].split('/')[0].strip()
+    if len(properties['name_zht']) != 0:
+        properties['name_zht'] = properties['name_zht'].split('/')[0].strip()
+
+    return properties
+
 def fetchwikidata(a_wid):
     """ Fetch wikidata with SPARQL """
-    sparql = SPARQLWrapper("https://query.wikidata.org/sparql", 'natural_earth_name_localizer v1.1.0 (github.com/nvkelso/natural-earth-vector)')
+    sparql = SPARQLWrapper("https://query.wikidata.org/sparql", 'natural_earth_name_localizer v1.1.1 (github.com/nvkelso/natural-earth-vector)')
 
     query_template = """
         SELECT
             ?e ?i ?r ?population
@@ -102,6 +166,8 @@ def fetchwikidata(a_wid):
             ?name_ur
             ?name_vi
             ?name_zh
+            ?name_zh_hans
+            ?name_zh_hant
         WHERE {
             {
                 SELECT DISTINCT ?e ?i ?r
@@ -119,7 +185,7 @@ def fetchwikidata(a_wid):
             OPTIONAL{?e rdfs:label ?name_el FILTER((LANG(?name_el))="el").}
             OPTIONAL{?e rdfs:label ?name_en FILTER((LANG(?name_en))="en").}
             OPTIONAL{?e rdfs:label ?name_es FILTER((LANG(?name_es))="es").}
-            OPTIONAL{?e rdfs:label ?name_fa FILTER((LANG(?name_fr))="fa").}
+            OPTIONAL{?e rdfs:label ?name_fa FILTER((LANG(?name_fa))="fa").}
            OPTIONAL{?e rdfs:label ?name_fr FILTER((LANG(?name_fr))="fr").}
             OPTIONAL{?e rdfs:label ?name_he FILTER((LANG(?name_he))="he").}
             OPTIONAL{?e rdfs:label ?name_hi FILTER((LANG(?name_hi))="hi").}
@@ -138,6 +204,8 @@ def fetchwikidata(a_wid):
             OPTIONAL{?e rdfs:label ?name_ur FILTER((LANG(?name_ur))="ur").}
             OPTIONAL{?e rdfs:label ?name_vi FILTER((LANG(?name_vi))="vi").}
             OPTIONAL{?e rdfs:label ?name_zh FILTER((LANG(?name_zh))="zh").}
+            OPTIONAL{?e rdfs:label ?name_zh_hans FILTER((LANG(?name_zh_hans))="zh-hans").}
+            OPTIONAL{?e rdfs:label ?name_zh_hant FILTER((LANG(?name_zh_hant))="zh-hant").}
         }
     """
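The Farsi fix in the OPTIONAL block above is easy to miss: the old line bound `?name_fa` but filtered on `LANG(?name_fr)`, a variable that is unbound inside that group, so the FILTER could never succeed and Farsi labels were silently dropped (the companion `name_fa`/`name_fr` assignment fix appears further down). A minimal, hypothetical smoke test of the corrected pattern with SPARQLWrapper — Q62 (San Francisco) is just an example id, not part of the patch:

    from SPARQLWrapper import SPARQLWrapper, JSON

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql",
                           agent='name_localizer_smoketest (example)')
    sparql.setQuery("""
        SELECT ?name_fa ?name_zh_hant WHERE {
          OPTIONAL{ wd:Q62 rdfs:label ?name_fa FILTER((LANG(?name_fa))="fa"). }
          OPTIONAL{ wd:Q62 rdfs:label ?name_zh_hant FILTER((LANG(?name_zh_hant))="zh-hant"). }
        }
    """)
    sparql.setReturnFormat(JSON)
    for row in sparql.query().convert()['results']['bindings']:
        print(row.get('name_fa', {}).get('value'),
              row.get('name_zh_hant', {}).get('value'))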
@@ -237,7 +305,8 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
         "name_uk",
         "name_ur",
         "name_vi",
-        "name_zh"
+        "name_zh",
+        "name_zht"
         ))
 
@@ -285,12 +354,13 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
             name_el = get_sparql_label(result, 'name_el')
             name_en = get_sparql_label(result, 'name_en')
             name_es = get_sparql_label(result, 'name_es')
-            name_fr = get_sparql_label(result, 'name_fa')
+            name_fa = get_sparql_label(result, 'name_fa')
             name_fr = get_sparql_label(result, 'name_fr')
             name_he = get_sparql_label(result, 'name_he')
             name_hi = get_sparql_label(result, 'name_hi')
             name_hu = get_sparql_label(result, 'name_hu')
             name_id = get_sparql_label(result, 'name_id')
+            name_it = get_sparql_label(result, 'name_it')
             name_ja = get_sparql_label(result, 'name_ja')
             name_ko = get_sparql_label(result, 'name_ko')
             name_lt = get_sparql_label(result, 'name_lt')
@@ -303,7 +373,44 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
             name_uk = get_sparql_label(result, 'name_uk')
             name_ur = get_sparql_label(result, 'name_ur')
             name_vi = get_sparql_label(result, 'name_vi')
-            name_zh = get_sparql_label(result, 'name_zh')
+
+            # not all Wikidata places have all name (label) translations
+            try:
+                name_en = get_sparql_label(result, 'name_en')
+            except:
+                name_en = u''
+
+            try:
+                name_zh_default = get_sparql_label(result, 'name_zh')
+            except:
+                name_zh_default = u''
+
+            try:
+                name_zh_hans = get_sparql_label(result, 'name_zh_hans')
+            except:
+                name_zh_hans = u''
+
+            try:
+                name_zh_hant = get_sparql_label(result, 'name_zh_hant')
+            except:
+                name_zh_hant = u''
+
+            chinese_names = { 'name_en' : name_en,
+                              'name_zh_default' : name_zh_default,
+                              'name_zh_hans' : name_zh_hans,
+                              'name_zh_hant' : name_zh_hant
+                            }
+
+            processed_chinese_names = post_process_wd_zh( chinese_names )
+
+            try:
+                name_zh = processed_chinese_names['name_zhs']
+            except:
+                name_zh = u''
+            try:
+                name_zht = processed_chinese_names['name_zht']
+            except:
+                name_zht = u''
 
             writer.writerow((
                 wd_id,
@@ -334,7 +441,8 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
                 name_uk,
                 name_ur,
                 name_vi,
-                name_zh
+                name_zh,
+                name_zht
                 ))
 
 print(' - JOB end -')
\ No newline at end of file
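End to end, `post_process_wd_zh` is what turns a slash-separated default label into the two CSV columns. A minimal sketch of the expected behaviour, using the same place as the function's own comment and assuming the hanzidentifier classifications noted below:

    # Hypothetical record: no explicit zh-hans/zh-hant labels, only a multi-valued zh default.
    chinese_names = {'name_en': 'San Francisco',
                     'name_zh_default': '旧金山市县/三藩市市縣/舊金山市郡',
                     'name_zh_hans': '',
                     'name_zh_hant': ''}
    processed = post_process_wd_zh(chinese_names)
    # '旧金山市县' is classified as Simplified and '三藩市市縣' as Traditional, so:
    #   processed['name_zhs'] == '旧金山市县'
    #   processed['name_zht'] == '三藩市市縣'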
diff --git a/tools/wikidata/write_wikidata.py b/tools/wikidata/write_wikidata.py
index 44fbfe9a..62e5a378 100755
--- a/tools/wikidata/write_wikidata.py
+++ b/tools/wikidata/write_wikidata.py
@@ -44,11 +44,11 @@ parser.add_argument('-output_csvsumlog',
 
 args = parser.parse_args()
 
-riverclean_regex = re.compile(r'\b('+'River|rivière de la |rivière à la|rivière des|Rivière De|Rivière du|Rivière aux|Rivière|rivier|Rio dos|Rio|Río La|Río de los|Río de las|Río dos|Río|sông|-folyó|folyó|canale di|canale|Nehri|Jiang'+r')\b',
+riverclean_regex = re.compile(r'\b('+'River|rivière de la |rivière à la|rivière des|Rivière De|Rivière du|Rivière aux|Rivière|rivier|Rio dos|Rio|Río La|Río de los|Río de las|Río dos|Río|sông|-folyó|folyó|canale di|canale|Nehri|Jiang|Sungai'+r')\b',
                                flags=re.IGNORECASE)
 # Some of these are proper names (Lake of the Ozark's, Clear Lake Reservoir) and
 # shouldn't be stripped, but in the meantime, strip aggressively
-lakeclean_regex = re.compile(r'\b('+'Lake of the|Grand Lake o\' the|Lake Reservoir|Grand Lake|Grant Lake|Lake of|Lake|Lago degli|Lago del|Lago de|lago di|Lago la|Lago do|Lago |lago d\'||Lago|Lac de la|lac d\'|lac des|Lac des|lac de|Lac de|Lac au|lac di|lac la|Lac La|Lac à l’|lac à la|Lac|lac|-See|See|Laguna de|Laguna|Lake Reservoir|Reservoir|réservoir de la|réservoir de|Reservatório de|réservoir|Réservoir|Represa de|Represa|baie de|Bahía de|öböl|Gölü|järv|Embalse de|Embalse|Bacino di|bacino di|Bacino|bacino|Sông|Lough|Hồ'+r')\b',
+lakeclean_regex = re.compile(r'\b('+'Lake of the|Grand Lake o\' the|Lake Reservoir|Grand Lake|Grant Lake|Lake of|Lake|Lago degli|Lago del|Lago de|lago di|Lago la|Lago do|Lago |lago d\'||Lago|Lac de la|lac d\'|lac des|Lac des|lac de|Lac de|Lac au|lac di|lac la|Lac La|Lac à l’|lac à la|Lac|lac|-See|See|Laguna de|Laguna|Lake Reservoir|Reservoir|réservoir de la|réservoir de|Reservatório de|réservoir|Réservoir|Represa de|Represa|baie de|Bahía de|öböl|Gölü|järv|Embalse de|Embalse|Bacino di|bacino di|Bacino|bacino|Sông|Lough|Hồ|Danau'+r')\b',
                              flags=re.IGNORECASE)
 #geolabels_regex = re.compile(r'\b('+'(wyspa)'+r')\b',
 #                             flags=re.IGNORECASE)
@@ -56,6 +56,8 @@ placeclean_regex = re.compile(r'\b('+'Municipality of|Municipality|First Nation|
                               flags=re.IGNORECASE)
 geo_region_regex = re.compile(r'\b('+'Região Autónoma dos'+r')\b',
                               flags=re.IGNORECASE)
+admin2_regex = re.compile(r'\b('+'County|Condado de|comté de|contea di|comté d|megye|Hrabstwo|Condado de|Quận|مقاطعة|City and Borough|census area di|Census Area|Borough|borough di|borough de|ilçesi'+r')\b',
+                          flags=re.IGNORECASE)
 admin1_regex = re.compile(r'\b('+'canton of |Canton of|Department|District Council|distretto di contea di|contea di|District de|distretto di|Distretto della|Distrik|distretto del|district|Constitutional Province|Province of |Provincia del|provincia delle|provincia della|provincia di|Provincia Constitucional del|Província Constitucional de|provincia de|Província de|Província do|Provincia de|Préfecture de|Provincia|Comunidad De|Autonome Provinz|Provincia Autónoma de|Provinz|Província|Departamento de|Departamento do|Autonomous Province of|Autonomous Province|Province de la|province de|Province du|Province|Provinsi|Municipality of |Municipality|Município de|Special Region of |Región Metropolitana de|Special Region|Autonomous Region|Capital Region of|Region of|-Region|Region|Governorate|Gouvernorat|Gubernatorstwo|Gobernación de|governatorato di|Capital of|Capital of|City Council|City and Borough of|City of|Città di|City|Región Metropolitana de|Metropolitan Region|Metropolitan Borough of|Metropolitan Borough|Borough Metropolitano de|London Borough of|district londonien|district londonien d\'|borough royal de|Royal Borough of|County Borough|Borough of|Borough Council|Metropoliten Borough|londonien de|district royal de|County|Old Royal Capital|(distrikt)|Distrik|(borough)|Cantão central de|cantone della|(cantão)|(departamento)|(departement)|Región del|Región de|Región|gouvernorat de|Gouvernorat|kormányzóság|regione di|Regione del|Prefectura de|prefettura di|Autonome Oblast|Oblast Autônomo|Autonomous Oblast|Oblast\' dell\'|Obwód Autonomiczny|Kraï de|Kraï|Oblast de|Óblast de|Oblast\' di|Oblast\'|oblast|distrito de|Distrito do|Distrito|métropolitain de|Voivodia da|cantone di|cantone dell\'|Munisipalitas\' di|Munisipalitas|Emirato di|Emirato|cantón del|cantón de|canton du|cantón|Καντόνι του|Καντόνι της|Ζουπανία του|Επαρχία του|Δήμος|Κυβερνείο του|distretto|Région autonome du|région de|Governamento de|Kegubernuran|comté de|parrocchia di|obwód|, London|, Londra|, Nya Zeeland|Daerah Istimewa|Autónoma del|Parish of|Parish|, Barbados|Circondario autonomo dei|Circondario autonomo|circondario autonomo degli|Okręg Autonomiczny|Dystrykt|-Distrikt|Distrikt|distriktet|, प्रांत|, पैराग्वे|, Zambia|, Kenya|, Καμερούν|, Τζαμάικα|, Barbados|, Londra|, Bahama|kommun|Ciudad de|, Gambia|, Botswana|tartomány|körzet|Munizip|division|Conselho do Borough de|Rejon|Raionul|Kotar|megye|Żupania|comune distrettuale di|comune distrettuale|Comune di|comune|Condado de|Condado|Kotamadya|Região Autónoma dos|Região Autónoma|Região|Guvernementet|Gobernación del|Gobernación'+r')\b',
                           flags=re.IGNORECASE)
 admin0_regex = re.compile(r'\b('+'(district)|(địa hạt)'+r')\b',
@@ -75,7 +77,7 @@ wddic = defaultdict(dict)
 wdredirects = defaultdict(dict)
 
 name_field_prefix = 'name_'
-languages = ['ar','bn','de','en','es','fr','el','he','hi','hu','id','it','ja','ko','nl','pl','pt','ru','sv','tr','uk','ur','vi','zh']
+languages = ['ar','bn','de','el','en','es','fa','fr','he','hi','hu','id','it','ja','ko','nl','pl','pt','ru','sv','tr','uk','ur','vi','zh','zht']
 
 new_properties = []
 with open(args.input_csv, newline='') as csvfile:
@@ -94,7 +96,7 @@
             # as proxy for featureclass
 
             # Rivers ...
-            if args.input_shape.lower().find('river') > 0:
+            if args.input_shape.lower().find('rivers_lake') > 0:
                 wddic[qid][d] = riverclean_regex.sub('', wddic[qid][d])
 
             # Comma ...
@@ -108,7 +110,7 @@
                 wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]
 
             # Lakes ...
-            if args.input_shape.lower().find('lake') > 0:
+            if args.input_shape.lower().find('lake') > 0 and args.input_shape.lower().find('10m_physical') > 0:
                 #if d == 'name_en' and wddic[qid]['name_en'] != 'Lake of the Woods':
                 wddic[qid][d] = lakeclean_regex.sub('', wddic[qid][d])
@@ -153,6 +155,19 @@
                 # RTL languages and LTR figure each other out in python 3
                 wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]
 
+            # Admin 2 counties ...
+            if args.input_shape.lower().find('admin_2') > 0:
+                wddic[qid][d] = admin2_regex.sub('', wddic[qid][d])
+
+                # Parenthetical ...
+                if wddic[qid][d].find('(') > 0:
+                    # RTL languages and LTR figure each other out in python 3
+                    wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]
+
+                # name_ko: strip a trailing "군" ("county") character
+                if d == 'name_ko' and wddic[qid]['name_ko'] and wddic[qid]['name_ko'][-1] == "군":
+                    wddic[qid]['name_ko'] = wddic[qid]['name_ko'][:-1]
+
             # Admin 1 states, provinces ...
             if args.input_shape.lower().find('admin_1') > 0:
                 wddic[qid][d] = admin1_regex.sub('', wddic[qid][d])
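The new `admin2_regex` follows the same strip-the-generic-term pattern as the river, lake, and admin_1 regexes above. A self-contained illustration of the intended effect, using a toy subset of the alternatives (the real pattern is much longer):

    import re

    # Toy version of admin2_regex: strip generic county words, keep the proper name.
    admin2_regex = re.compile(r'\b(County|Condado de|comté de)\b', flags=re.IGNORECASE)

    print(admin2_regex.sub('', 'Monterey County').strip())      # -> 'Monterey'
    print(admin2_regex.sub('', 'Condado de Monterey').strip())  # -> 'Monterey'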