update Wikidata scripts for Farsi, Chinese simplified, Chinese traditional

2025-06-30 00:02:07 -04:00 · 2021-08-01 10:36:51 -07:00 · 2021-08-01 10:36:51 -07:00 · 04e61779fe
commit 04e61779fe
parent 75feea9848
5 changed files with 173 additions and 43 deletions
--- a/run_all.sh
+++ b/run_all.sh
@ -33,14 +33,18 @@ rm -f $logmd
 #                          | mode       |LetterCase| shape_path  |  shape filename
 # == 10m ================= |=========== |==========| ============| ================================================
 function run10m {
-./tools/wikidata/update.sh  fetch_write  uppercase   10m_cultural  ne_10m_admin_0_sovereignty
-./tools/wikidata/update.sh  fetch_write  uppercase   10m_cultural  ne_10m_admin_0_countries
+./tools/wikidata/update.sh  fetch_write  uppercase   10m_cultural  ne_10m_admin_0_sovereignty                        # this and other admin_0 run, but Mapshaper overwrites them
+./tools/wikidata/update.sh  fetch_write  uppercase   10m_cultural  ne_10m_admin_0_countries                          # instead results are copied into housekeeping file's lookup table
 ./tools/wikidata/update.sh  fetch_write  uppercase   10m_cultural  ne_10m_admin_0_countries_lakes
 ./tools/wikidata/update.sh  fetch_write  uppercase   10m_cultural  ne_10m_admin_0_map_units
 ./tools/wikidata/update.sh  fetch_write  uppercase   10m_cultural  ne_10m_admin_0_map_subunits
 ./tools/wikidata/update.sh  fetch_write  uppercase   10m_cultural  ne_10m_admin_0_disputed_areas
-./tools/wikidata/update.sh  fetch_write  lowercase   10m_cultural  ne_10m_admin_1_states_provinces
+./tools/wikidata/update.sh  fetch_write  lowercase   10m_cultural  ne_10m_admin_1_states_provinces                   # this and other admin_1 run, but Mapshaper overwrites them
 ./tools/wikidata/update.sh  fetch_write  lowercase   10m_cultural  ne_10m_admin_1_states_provinces_lakes
+./tools/wikidata/update.sh  fetch_write  lowercase   10m_cultural  ne_10m_admin_1_label_points_details               # Mapshaper uses this to generate admin_1 polys
+./tools/wikidata/update.sh  fetch_write  uppercase   10m_cultural  ne_10m_admin_2_label_points_details               # Mapshaper uses this to generate admin_2 polys
+./tools/wikidata/update.sh  fetch_write  uppercase   10m_cultural  ne_10m_admin_2_counties                           # this and other admin_2 run, but Mapshaper overwrites them
+./tools/wikidata/update.sh  fetch_write  uppercase   10m_cultural  ne_10m_admin_2_counties_lakes
 ./tools/wikidata/update.sh  fetch_write  lowercase   10m_cultural  ne_10m_airports
 ./tools/wikidata/update.sh  fetch_write  lowercase   10m_cultural  ne_10m_populated_places                           # this should be built before derived Makefile themes run
 ./tools/wikidata/update.sh  fetch_write  lowercase   10m_physical  ne_10m_geographic_lines                           # this should be built before derived Makefile themes run
@ -57,7 +61,6 @@ function run10m {
 ./tools/wikidata/update.sh  fetch_write  lowercase   10m_physical  ne_10m_rivers_lake_centerlines_scale_rank
 ./tools/wikidata/update.sh  fetch_write  lowercase   10m_physical  ne_10m_rivers_europe
 ./tools/wikidata/update.sh  fetch_write  lowercase   10m_physical  ne_10m_rivers_north_america
-#./tools/wikidata/update.sh  fetch_write  lowercase   10m_cultural  ne_10m_admin_1_label_points_details                # geometry errors
 }

 function run50m {
--- a/tools/wikidata/Makefile
+++ b/tools/wikidata/Makefile
@ -6,6 +6,7 @@ install:
 	pip3 install -U SPARQLWrapper
 	pip3 install -U fiona
 	pip3 install -U csvtomd
+	pip3 install -U hanzidentifier

 clean:
 	cd ../.. && rm -rf temp_shape/*
--- a/tools/wikidata/README.md
+++ b/tools/wikidata/README.md
@ -6,6 +6,7 @@
   pip3 install -U fiona
   pip3 install -U csvtomd
   pip3 install -U requests
+   pip3 install -U hanzidentifier

 #run from the project root ( expected 30-40 minutes )
 # be careful this is running  'make all'
@ -357,34 +358,36 @@ lettercase = lowercase variable names [wikidataid, name_ar, name_bn, name_de, na
 see the _latest_  information in the `./run_all.sh`


-### supported languages ( now: 24)
+### supported languages ( now: 26)

-variable name | language     | language wikipedia link
--------------|--------------|----------------------------------------------------
- NAME_AR 	  | Arabic       | https://en.wikipedia.org/wiki/Arabic
- NAME_BN 	  | Bengali      | https://en.wikipedia.org/wiki/Bengali_language
- NAME_DE 	  | German       | https://en.wikipedia.org/wiki/German_language
- NAME_EN 	  | English      | https://en.wikipedia.org/wiki/English_language
- NAME_ES 	  | Spanish      | https://en.wikipedia.org/wiki/Spanish_language
- NAME_FR 	  | French       | https://en.wikipedia.org/wiki/French_language
- NAME_EL 	  | Modern Greek | https://en.wikipedia.org/wiki/Modern_Greek
- NAME_HE 	  | Hebrew     	 | https://en.wikipedia.org/wiki/Hebrew_language
- NAME_HI 	  | Hindi     	 | https://en.wikipedia.org/wiki/Hindi
- NAME_HU 	  | Hungarian 	 | https://en.wikipedia.org/wiki/Hungarian_language
- NAME_ID 	  | Indonesian   | https://en.wikipedia.org/wiki/Indonesian_language
- NAME_IT 	  | Italian      | https://en.wikipedia.org/wiki/Italian_language
- NAME_JA 	  | Japanese     | https://en.wikipedia.org/wiki/Japanese_language
- NAME_KO 	  | Korean       | https://en.wikipedia.org/wiki/Korean_language
- NAME_NL 	  | Dutch        | https://en.wikipedia.org/wiki/Dutch_language
- NAME_PL 	  | Polish       | https://en.wikipedia.org/wiki/Polish_language
- NAME_PT 	  | Portuguese   | https://en.wikipedia.org/wiki/Portuguese_language
- NAME_RU 	  | Russian      | https://en.wikipedia.org/wiki/Russian_language
- NAME_SV 	  | Swedish      | https://en.wikipedia.org/wiki/Swedish_language
- NAME_TR 	  | Turkish      | https://en.wikipedia.org/wiki/Turkish_language
- NAME_UK 	  | Ukrainian    | https://en.wikipedia.org/wiki/Ukrainian_language
- NAME_UR 	  | Urdu         | https://en.wikipedia.org/wiki/Urdu
- NAME_VI 	  | Vietnamese   | https://en.wikipedia.org/wiki/Vietnamese_language
- NAME_ZH 	  | Chinese      | https://en.wikipedia.org/wiki/Chinese_language
+variable name | language              | language wikipedia link
+--------------|-----------------------|----------------------------------------------------
+ NAME_AR 	  | Arabic                | https://en.wikipedia.org/wiki/Arabic
+ NAME_BN 	  | Bengali               | https://en.wikipedia.org/wiki/Bengali_language
+ NAME_DE 	  | German                | https://en.wikipedia.org/wiki/German_language
+ NAME_EN 	  | English               | https://en.wikipedia.org/wiki/English_language
+ NAME_EL 	  | Greek (modern)        | https://en.wikipedia.org/wiki/Modern_Greek
+ NAME_ES 	  | Spanish               | https://en.wikipedia.org/wiki/Spanish_language
+ NAME_FA 	  | Farsi                 | https://en.wikipedia.org/wiki/Persian_language
+ NAME_FR 	  | French                | https://en.wikipedia.org/wiki/French_language
+ NAME_HE 	  | Hebrew     	          | https://en.wikipedia.org/wiki/Hebrew_language
+ NAME_HI 	  | Hindi     	          | https://en.wikipedia.org/wiki/Hindi
+ NAME_HU 	  | Hungarian 	          | https://en.wikipedia.org/wiki/Hungarian_language
+ NAME_ID 	  | Indonesian            | https://en.wikipedia.org/wiki/Indonesian_language
+ NAME_IT 	  | Italian               | https://en.wikipedia.org/wiki/Italian_language
+ NAME_JA 	  | Japanese              | https://en.wikipedia.org/wiki/Japanese_language
+ NAME_KO 	  | Korean                | https://en.wikipedia.org/wiki/Korean_language
+ NAME_NL 	  | Dutch                 | https://en.wikipedia.org/wiki/Dutch_language
+ NAME_PL 	  | Polish                | https://en.wikipedia.org/wiki/Polish_language
+ NAME_PT 	  | Portuguese            | https://en.wikipedia.org/wiki/Portuguese_language
+ NAME_RU 	  | Russian               | https://en.wikipedia.org/wiki/Russian_language
+ NAME_SV 	  | Swedish               | https://en.wikipedia.org/wiki/Swedish_language
+ NAME_TR 	  | Turkish               | https://en.wikipedia.org/wiki/Turkish_language
+ NAME_UK 	  | Ukrainian             | https://en.wikipedia.org/wiki/Ukrainian_language
+ NAME_UR 	  | Urdu                  | https://en.wikipedia.org/wiki/Urdu
+ NAME_VI 	  | Vietnamese            | https://en.wikipedia.org/wiki/Vietnamese_language
+ NAME_ZH 	  | Chinese (simplified)  | https://en.wikipedia.org/wiki/Chinese_language
+ NAME_ZHT 	  | Chinese (traditional) | https://en.wikipedia.org/wiki/Traditional_Chinese_characters

 # Name cleaning

--- a/tools/wikidata/fetch_wikidata.py
+++ b/tools/wikidata/fetch_wikidata.py
@ -2,6 +2,7 @@

 #--  pip3 install -U SPARQLWrapper
 #--  pip3 install -U fiona
+#--  pip3 install -U hanzidentifier

 """
 Fetch Wikidata Labels
@ -20,6 +21,7 @@ import argparse
 import csv
 import sys
 import time
+import hanzidentifier
 #import requests

 from SPARQLWrapper import SPARQLWrapper, JSON, SPARQLExceptions
@ -31,7 +33,7 @@ parser.add_argument('-input_shape_name',
                    default='../../10m_cultural/ne_10m_populated_places.shp',
                    help='input natural-earth shape file - with wikidataid columns')
 parser.add_argument('-input_lettercase',
-                    default='lowercase',
+                    default='uppercase',
                    help='variables in thes hape file - lowercase or uppercase')
 parser.add_argument('-output_csv_name',
                    default='ne_10m_populated_places.csv',
@ -67,13 +69,75 @@ def get_sparql_numvalue(sresult, variable_id):
        val = float(sresult[variable_id]['value'])
    return val

+def post_process_wd_zh(properties):
+    """Backfill the Simplified and Traditional Chinese names of one record.
+
+    Keys read from `properties` (all optional):
+      name_en          English label, used only to reject English text that
+                       ended up in a Chinese label
+      name_zh_default  generic "zh" label; may hold several "/"-separated
+                       variants
+      name_zh_hans     "zh-hans" label
+      name_zh_hant     "zh-hant" label
+
+    Keys written, unless the caller already supplied them:
+      name_zhs         Simplified name; falls back to the Traditional one
+      name_zht         Traditional name; falls back to the Simplified one
+
+    Both are set to u'' when nothing usable is found. The dict is modified
+    in place and the same dict is returned. The intermediate keys
+    (name_zh_default, name_zh_hans, name_zh_hant) are left untouched.
+    """
+
+    # `or u''` also covers a key that is present but holds None
+    name_en_default = properties.get('name_en') or u''
+    zh_Hans_fallback = properties.get('name_zh_hans') or u''
+    zh_Hant_fallback = properties.get('name_zh_hant') or u''
+
+    # sometimes the default Chinese name has several values in a list;
+    # the script-specific labels win, the default label only fills gaps
+    for name in (properties.get('name_zh_default') or u'').split('/'):
+        name = name.strip()
+        if not name:
+            continue
+        # a variant may pass both checks and then backfills both scripts
+        if hanzidentifier.is_simplified(name) and not zh_Hans_fallback:
+            zh_Hans_fallback = name
+        if hanzidentifier.is_traditional(name) and not zh_Hant_fallback:
+            zh_Hant_fallback = name
+
+    # make sure we don't shove English values into Chinese namespace
+    if name_en_default:
+        if zh_Hans_fallback == name_en_default:
+            zh_Hans_fallback = u''
+        if zh_Hant_fallback == name_en_default:
+            zh_Hant_fallback = u''
+
+    # now make traditional and simplified Chinese name assignments,
+    # each script falling back to the other one before giving up
+    if 'name_zhs' not in properties:
+        properties['name_zhs'] = zh_Hans_fallback or zh_Hant_fallback
+
+    if 'name_zht' not in properties:
+        properties['name_zht'] = zh_Hant_fallback or zh_Hans_fallback
+
+    # only select one of the options if the field is separated by "/"
+    # for example if the field is "旧金山市县/三藩市市縣/舊金山市郡" only the first
+    # one 旧金山市县 will be preserved.
+    # The keys trimmed are the ones assigned above; the previous code read
+    # properties['name_zh'], which is never set here and raised KeyError.
+    # A caller-supplied 'name_zh' is still trimmed when present.
+    for key in ('name_zh', 'name_zhs', 'name_zht'):
+        if properties.get(key):
+            properties[key] = properties[key].split('/')[0].strip()
+
+    return properties
+
+

 def fetchwikidata(a_wid):
    """
    Fetch wikidata with SPARQL
    """

-    sparql = SPARQLWrapper("https://query.wikidata.org/sparql", 'natural_earth_name_localizer v1.1.0 (github.com/nvkelso/natural-earth-vector)')
+    sparql = SPARQLWrapper("https://query.wikidata.org/sparql", 'natural_earth_name_localizer v1.1.1 (github.com/nvkelso/natural-earth-vector)')
    query_template = """
        SELECT
            ?e ?i ?r ?population
@ -102,6 +166,8 @@ def fetchwikidata(a_wid):
            ?name_ur
            ?name_vi
            ?name_zh
+            ?name_zh_hans
+            ?name_zh_hant
        WHERE {
            {
                SELECT DISTINCT  ?e ?i ?r
@ -119,7 +185,7 @@ def fetchwikidata(a_wid):
            OPTIONAL{?e rdfs:label ?name_el FILTER((LANG(?name_el))="el").}
            OPTIONAL{?e rdfs:label ?name_en FILTER((LANG(?name_en))="en").}
            OPTIONAL{?e rdfs:label ?name_es FILTER((LANG(?name_es))="es").}
-            OPTIONAL{?e rdfs:label ?name_fa FILTER((LANG(?name_fr))="fa").}
+            OPTIONAL{?e rdfs:label ?name_fa FILTER((LANG(?name_fa))="fa").}
            OPTIONAL{?e rdfs:label ?name_fr FILTER((LANG(?name_fr))="fr").}
            OPTIONAL{?e rdfs:label ?name_he FILTER((LANG(?name_he))="he").}
            OPTIONAL{?e rdfs:label ?name_hi FILTER((LANG(?name_hi))="hi").}
@ -138,6 +204,8 @@ def fetchwikidata(a_wid):
            OPTIONAL{?e rdfs:label ?name_ur FILTER((LANG(?name_ur))="ur").}
            OPTIONAL{?e rdfs:label ?name_vi FILTER((LANG(?name_vi))="vi").}
            OPTIONAL{?e rdfs:label ?name_zh FILTER((LANG(?name_zh))="zh").}
+            OPTIONAL{?e rdfs:label ?name_zh_hans FILTER((LANG(?name_zh_hans))="zh-hans").}
+            OPTIONAL{?e rdfs:label ?name_zh_hant FILTER((LANG(?name_zh_hant))="zh-hant").}
        }

    """
@ -237,7 +305,8 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
        "name_uk",
        "name_ur",
        "name_vi",
-        "name_zh"
+        "name_zh",
+        "name_zht"
    ))


@ -285,12 +354,13 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
                    name_el = get_sparql_label(result, 'name_el')
                    name_en = get_sparql_label(result, 'name_en')
                    name_es = get_sparql_label(result, 'name_es')
-                    name_fr = get_sparql_label(result, 'name_fa')
+                    name_fa = get_sparql_label(result, 'name_fa')
                    name_fr = get_sparql_label(result, 'name_fr')
                    name_he = get_sparql_label(result, 'name_he')
                    name_hi = get_sparql_label(result, 'name_hi')
                    name_hu = get_sparql_label(result, 'name_hu')
                    name_id = get_sparql_label(result, 'name_id')
+                    name_it = get_sparql_label(result, 'name_it')
                    name_ja = get_sparql_label(result, 'name_ja')
                    name_ko = get_sparql_label(result, 'name_ko')
                    name_lt = get_sparql_label(result, 'name_lt')
@ -303,7 +373,44 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
                    name_uk = get_sparql_label(result, 'name_uk')
                    name_ur = get_sparql_label(result, 'name_ur')
                    name_vi = get_sparql_label(result, 'name_vi')
-                    name_zh = get_sparql_label(result, 'name_zh')
+
+                    # not all Wikidata places have all name (label) translations
+                    try:
+                        name_en = get_sparql_label(result, 'name_en')
+                    except:
+                        name_en = u''
+
+                    try:
+                        name_zh_default = get_sparql_label(result, 'name_zh')
+                    except:
+                        name_zh_default = u''
+
+                    try:
+                        name_zh_hans = get_sparql_label(result, 'name_zh_hans')
+                    except:
+                        name_zh_hans = u''
+
+                    try:
+                        name_zh_hant = get_sparql_label(result, 'name_zh_hant')
+                    except:
+                        name_zh_hant = u''
+
+                    chinese_names = { 'name_en'         : name_en,
+                                      'name_zh_default' : name_zh_default,
+                                      'name_zh_hans'    : name_zh_hans,
+                                      'name_zh_hant'    : name_zh_hant
+                                    }
+
+                    processed_chinese_names = post_process_wd_zh( chinese_names )
+
+                    try:
+                        name_zh  = processed_chinese_names['name_zhs']
+                    except:
+                        name_zh  = u''
+                    try:
+                        name_zht = processed_chinese_names['name_zht']
+                    except:
+                        name_zht  = u''

                    writer.writerow((
                        wd_id,
@ -334,7 +441,8 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
                        name_uk,
                        name_ur,
                        name_vi,
-                        name_zh
+                        name_zh,
+                        name_zht
                        ))

 print(' - JOB end -')
--- a/tools/wikidata/write_wikidata.py
+++ b/tools/wikidata/write_wikidata.py
@ -44,11 +44,11 @@ parser.add_argument('-output_csvsumlog',

 args = parser.parse_args()

-riverclean_regex = re.compile(r'\b('+'River|rivière de la |rivière à la|rivière des|Rivière De|Rivière du|Rivière aux|Rivière|rivier|Rio dos|Rio|Río La|Río de los|Río de las|Río dos|Río|sông|-folyó|folyó|canale di|canale|Nehri|Jiang'+r')\b',
+riverclean_regex = re.compile(r'\b('+'River|rivière de la |rivière à la|rivière des|Rivière De|Rivière du|Rivière aux|Rivière|rivier|Rio dos|Rio|Río La|Río de los|Río de las|Río dos|Río|sông|-folyó|folyó|canale di|canale|Nehri|Jiang|Sungai'+r')\b',
                              flags=re.IGNORECASE)
 # Some of these are proper names (Lake of the Ozark's, Clear Lake Reservoir) and
 # shouldn't be stripped, but in the meantime, strip aggressively
-lakeclean_regex = re.compile(r'\b('+'Lake of the|Grand Lake o\' the|Lake Reservoir|Grand Lake|Grant Lake|Lake of|Lake|Lago degli|Lago del|Lago de|lago di|Lago la|Lago do|Lago |lago d\'||Lago|Lac de la|lac d\'|lac des|Lac des|lac de|Lac de|Lac au|lac di|lac la|Lac La|Lac à l’|lac à la|Lac|lac|-See|See|Laguna de|Laguna|Lake Reservoir|Reservoir|réservoir de la|réservoir de|Reservatório de|réservoir|Réservoir|Represa de|Represa|baie de|Bahía de|öböl|Gölü|järv|Embalse de|Embalse|Bacino di|bacino di|Bacino|bacino|Sông|Lough|Hồ'+r')\b',
+lakeclean_regex = re.compile(r'\b('+'Lake of the|Grand Lake o\' the|Lake Reservoir|Grand Lake|Grant Lake|Lake of|Lake|Lago degli|Lago del|Lago de|lago di|Lago la|Lago do|Lago |lago d\'||Lago|Lac de la|lac d\'|lac des|Lac des|lac de|Lac de|Lac au|lac di|lac la|Lac La|Lac à l’|lac à la|Lac|lac|-See|See|Laguna de|Laguna|Lake Reservoir|Reservoir|réservoir de la|réservoir de|Reservatório de|réservoir|Réservoir|Represa de|Represa|baie de|Bahía de|öböl|Gölü|järv|Embalse de|Embalse|Bacino di|bacino di|Bacino|bacino|Sông|Lough|Hồ|Danau'+r')\b',
                              flags=re.IGNORECASE)
 #geolabels_regex = re.compile(r'\b('+'(wyspa)'+r')\b',
 #                              flags=re.IGNORECASE)
@ -56,6 +56,8 @@ placeclean_regex = re.compile(r'\b('+'Municipality of|Municipality|First Nation|
                              flags=re.IGNORECASE)
 geo_region_regex = re.compile(r'\b('+'Região Autónoma dos'+r')\b',
                              flags=re.IGNORECASE)
+admin2_regex = re.compile(r'\b('+'County|Condado de|comté de|contea di|comté d|megye|Hrabstwo|Condado de|Quận|مقاطعة|City and Borough|census area di|Census Area|Borough|borough di|borough de|ilçesi'+r')\b',
+                              flags=re.IGNORECASE)
 admin1_regex = re.compile(r'\b('+'canton of |Canton of|Department|District Council|distretto di contea di|contea di|District de|distretto di|Distretto della|Distrik|distretto del|district|Constitutional Province|Province of |Provincia del|provincia delle|provincia della|provincia di|Provincia Constitucional del|Província Constitucional de|provincia de|Província de|Província do|Provincia de|Préfecture de|Provincia|Comunidad De|Autonome Provinz|Provincia Autónoma de|Provinz|Província|Departamento de|Departamento do|Autonomous Province of|Autonomous Province|Province de la|province de|Province du|Province|Provinsi|Municipality of |Municipality|Município de|Special Region of |Región Metropolitana de|Special Region|Autonomous Region|Capital Region of|Region of|-Region|Region|Governorate|Gouvernorat|Gubernatorstwo|Gobernación de|governatorato di|Capital  of|Capital of|City Council|City and Borough of|City of|Città di|City|Región Metropolitana de|Metropolitan Region|Metropolitan Borough of|Metropolitan Borough|Borough Metropolitano de|London Borough of|district londonien|district londonien d\'|borough royal de|Royal Borough of|County Borough|Borough of|Borough Council|Metropoliten Borough|londonien de|district royal de|County|Old Royal Capital|(distrikt)|Distrik|(borough)|Cantão central de|cantone della|(cantão)|(departamento)|(departement)|Región del|Región de|Región|gouvernorat de|Gouvernorat|kormányzóság|regione di|Regione del|Prefectura de|prefettura di|Autonome Oblast|Oblast Autônomo|Autonomous Oblast|Oblast\' dell\'|Obwód Autonomiczny|Kraï de|Kraï|Oblast de|Óblast de|Oblast\' di|Oblast\'|oblast|distrito de|Distrito do|Distrito|métropolitain de|Voivodia da|cantone di|cantone dell\'|Munisipalitas\' di|Munisipalitas|Emirato di|Emirato|cantón del|cantón de|canton du|cantón|Καντόνι του|Καντόνι της|Ζουπανία του|Επαρχία του|Δήμος|Κυβερνείο του|distretto|Région autonome du|région de|Governamento de|Kegubernuran|comté de|parrocchia di|obwód|, London|, Londra|, Nya 
Zeeland|Daerah Istimewa|Autónoma del|Parish of|Parish|, Barbados|Circondario autonomo dei|Circondario autonomo|circondario autonomo degli|Okręg Autonomiczny|Dystrykt|-Distrikt|Distrikt|distriktet|, प्रांत|, पैराग्वे|, Zambia|, Kenya|, Καμερούν|, Τζαμάικα|, Barbados|, Londra|, Bahama|kommun|Ciudad de|, Gambia|, Botswana|tartomány|körzet|Munizip|division|Conselho do Borough de|Rejon|Raionul|Kotar|megye|Żupania|comune distrettuale di|comune distrettuale|Comune di|comune|Condado de|Condado|Kotamadya|Região Autónoma dos|Região Autónoma|Região|Guvernementet|Gobernación del|Gobernación'+r')\b',
                              flags=re.IGNORECASE)
 admin0_regex = re.compile(r'\b('+'(district)|(địa hạt)'+r')\b',
@ -75,7 +77,7 @@ wddic = defaultdict(dict)
 wdredirects = defaultdict(dict)

 name_field_prefix = 'name_'
-languages = ['ar','bn','de','en','es','fr','el','he','hi','hu','id','it','ja','ko','nl','pl','pt','ru','sv','tr','uk','ur','vi','zh']
+languages = ['ar','bn','de','el','en','es','fa','fr','he','hi','hu','id','it','ja','ko','nl','pl','pt','ru','sv','tr','uk','ur','vi','zh','zht']
 new_properties = []

 with open(args.input_csv, newline='') as csvfile:
@ -94,7 +96,7 @@ with open(args.input_csv, newline='') as csvfile:
                    # as proxy for featureclass

                    # Rivers ...
-                    if args.input_shape.lower().find('river') > 0:
+                    if args.input_shape.lower().find('rivers_lake') > 0:
                        wddic[qid][d] = riverclean_regex.sub('', wddic[qid][d])

                        # Comma ...
@ -108,7 +110,7 @@ with open(args.input_csv, newline='') as csvfile:
                            wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]

                    # Lakes ...
-                    if args.input_shape.lower().find('lake') > 0:
+                    if args.input_shape.lower().find('lake') > 0 and args.input_shape.lower().find('10m_physical') > 0:
                        #if d == 'name_en' and wddic[qid]['name_en'] != 'Lake of the Woods':
                        wddic[qid][d]=lakeclean_regex.sub('', wddic[qid][d] )

@ -153,6 +155,19 @@ with open(args.input_csv, newline='') as csvfile:
                            # RTL languages and LTR figure each other out in python 3
                            wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]

+                    # Admin 2 counties ...
+                    if args.input_shape.lower().find('admin_2') > 0:
+                        wddic[qid][d] = admin2_regex.sub('', wddic[qid][d])
+
+                        # Parenthetical ...
+                        if wddic[qid][d].find('(') > 0:
+                            # RTL languages and LTR figure each other out in python 3
+                            wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]
+
+                        # name_ko: remove the trailing "군" (county) character
+                        if d == 'name_ko' and wddic[qid]['name_ko'] and wddic[qid]['name_ko'][-1] == "군":
+                            wddic[qid]['name_ko'] = wddic[qid]['name_ko'][:-1]
+
                    # Admin 1 states, provinces ...
                    if args.input_shape.lower().find('admin_1') > 0:
                        wddic[qid][d] = admin1_regex.sub('', wddic[qid][d])