update Wikidata scripts for Farsi, Chinese simplified, Chinese traditional

This commit is contained in:
Nathaniel Kelso 2021-08-01 10:36:51 -07:00
parent 75feea9848
commit 04e61779fe
5 changed files with 173 additions and 43 deletions

View File

@ -33,14 +33,18 @@ rm -f $logmd
# | mode |LetterCase| shape_path | shape filename # | mode |LetterCase| shape_path | shape filename
# == 10m ================= |=========== |==========| ============| ================================================ # == 10m ================= |=========== |==========| ============| ================================================
function run10m { function run10m {
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_sovereignty ./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_sovereignty # this and other admin_0 run, but Mapshaper overwrites them
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries ./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries # instead results are copied into housekeeping file's lookup table
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries_lakes ./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries_lakes
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_map_units ./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_map_units
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_map_subunits ./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_map_subunits
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_disputed_areas ./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_disputed_areas
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces ./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces # this and other admin_1 run, but Mapshaper overwrites them
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces_lakes ./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces_lakes
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_label_points_details # Mapshaper uses this to generate admin_1 polys
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_2_label_points_details # Mapshaper uses this to generate admin_2 polys
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_2_counties # this and other admin_2 run, but Mapshaper overwrites them
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_2_counties_lakes
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_airports ./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_airports
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_populated_places # this should be build before derived Makefile themes run ./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_populated_places # this should be build before derived Makefile themes run
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_geographic_lines # this should be build before derived Makefile themes run ./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_geographic_lines # this should be build before derived Makefile themes run
@ -57,7 +61,6 @@ function run10m {
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_lake_centerlines_scale_rank ./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_lake_centerlines_scale_rank
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_europe ./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_europe
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_north_america ./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_north_america
#./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_label_points_details # geometry errors
} }
function run50m { function run50m {

View File

@ -6,6 +6,7 @@ install:
pip3 install -U SPARQLWrapper pip3 install -U SPARQLWrapper
pip3 install -U fiona pip3 install -U fiona
pip3 install -U csvtomd pip3 install -U csvtomd
pip3 install -U hanzidentifier
clean: clean:
cd ../.. && rm -rf temp_shape/* cd ../.. && rm -rf temp_shape/*

View File

@ -6,6 +6,7 @@
pip3 install -U fiona pip3 install -U fiona
pip3 install -U csvtomd pip3 install -U csvtomd
pip3 install -U requests pip3 install -U requests
pip3 install -U hanzidentifier
#run from the project root ( expected 30-40 minutes ) #run from the project root ( expected 30-40 minutes )
# be careful this is running 'make all' # be careful this is running 'make all'
@ -357,34 +358,36 @@ lettercase = lowercase variable names [wikidataid, name_ar, name_bn, name_de, na
see the _latest_ information in the `./run_all.sh` see the _latest_ information in the `./run_all.sh`
### supported languages ( now: 24) ### supported languages ( now: 26)
variable name | language | language wikipedia link variable name | language | language wikipedia link
--------------|--------------|---------------------------------------------------- --------------|-----------------------|----------------------------------------------------
NAME_AR | Arabic | https://en.wikipedia.org/wiki/Arabic NAME_AR | Arabic | https://en.wikipedia.org/wiki/Arabic
NAME_BN | Bengali | https://en.wikipedia.org/wiki/Bengali_language NAME_BN | Bengali | https://en.wikipedia.org/wiki/Bengali_language
NAME_DE | German | https://en.wikipedia.org/wiki/German_language NAME_DE | German | https://en.wikipedia.org/wiki/German_language
NAME_EN | English | https://en.wikipedia.org/wiki/English_language NAME_EN | English | https://en.wikipedia.org/wiki/English_language
NAME_ES | Spanish | https://en.wikipedia.org/wiki/Spanish_language NAME_EL | Greek (modern) | https://en.wikipedia.org/wiki/Modern_Greek
NAME_FR | French | https://en.wikipedia.org/wiki/French_language NAME_ES | Spanish | https://en.wikipedia.org/wiki/Spanish_language
NAME_EL | Modern Greek | https://en.wikipedia.org/wiki/Modern_Greek NAME_FA | Farsi | https://en.wikipedia.org/wiki/Persian_language
NAME_HE | Hebrew | https://en.wikipedia.org/wiki/Hebrew_language NAME_FR | French | https://en.wikipedia.org/wiki/French_language
NAME_HI | Hindi | https://en.wikipedia.org/wiki/Hindi NAME_HE | Hebrew | https://en.wikipedia.org/wiki/Hebrew_language
NAME_HU | Hungarian | https://en.wikipedia.org/wiki/Hungarian_language NAME_HI | Hindi | https://en.wikipedia.org/wiki/Hindi
NAME_ID | Indonesian | https://en.wikipedia.org/wiki/Indonesian_language NAME_HU | Hungarian | https://en.wikipedia.org/wiki/Hungarian_language
NAME_IT | Italian | https://en.wikipedia.org/wiki/Italian_language NAME_ID | Indonesian | https://en.wikipedia.org/wiki/Indonesian_language
NAME_JA | Japanese | https://en.wikipedia.org/wiki/Japanese_language NAME_IT | Italian | https://en.wikipedia.org/wiki/Italian_language
NAME_KO | Korean | https://en.wikipedia.org/wiki/Korean_language NAME_JA | Japanese | https://en.wikipedia.org/wiki/Japanese_language
NAME_NL | Dutch | https://en.wikipedia.org/wiki/Dutch_language NAME_KO | Korean | https://en.wikipedia.org/wiki/Korean_language
NAME_PL | Polish | https://en.wikipedia.org/wiki/Polish_language NAME_NL | Dutch | https://en.wikipedia.org/wiki/Dutch_language
NAME_PT | Portuguese | https://en.wikipedia.org/wiki/Portuguese_language NAME_PL | Polish | https://en.wikipedia.org/wiki/Polish_language
NAME_RU | Russian | https://en.wikipedia.org/wiki/Russian_language NAME_PT | Portuguese | https://en.wikipedia.org/wiki/Portuguese_language
NAME_SV | Swedish | https://en.wikipedia.org/wiki/Swedish_language NAME_RU | Russian | https://en.wikipedia.org/wiki/Russian_language
NAME_TR | Turkish | https://en.wikipedia.org/wiki/Turkish_language NAME_SV | Swedish | https://en.wikipedia.org/wiki/Swedish_language
NAME_UK | Ukrainian | https://en.wikipedia.org/wiki/Ukrainian_language NAME_TR | Turkish | https://en.wikipedia.org/wiki/Turkish_language
NAME_UR | Urdu | https://en.wikipedia.org/wiki/Urdu NAME_UK | Ukrainian | https://en.wikipedia.org/wiki/Ukrainian_language
NAME_VI | Vietnamese | https://en.wikipedia.org/wiki/Vietnamese_language NAME_UR | Urdu | https://en.wikipedia.org/wiki/Urdu
NAME_ZH | Chinese | https://en.wikipedia.org/wiki/Chinese_language NAME_VI | Vietnamese | https://en.wikipedia.org/wiki/Vietnamese_language
NAME_ZH | Chinese (simplified) | https://en.wikipedia.org/wiki/Chinese_language
NAME_ZHT | Chinese (traditional) | https://en.wikipedia.org/wiki/Traditional_Chinese_characters
# Name cleaning # Name cleaning

View File

@ -2,6 +2,7 @@
#-- pip3 install -U SPARQLWrapper #-- pip3 install -U SPARQLWrapper
#-- pip3 install -U fiona #-- pip3 install -U fiona
#-- pip3 install -U hanzidentifier
""" """
Fetch Wikidata Labels Fetch Wikidata Labels
@ -20,6 +21,7 @@ import argparse
import csv import csv
import sys import sys
import time import time
import hanzidentifier
#import requests #import requests
from SPARQLWrapper import SPARQLWrapper, JSON, SPARQLExceptions from SPARQLWrapper import SPARQLWrapper, JSON, SPARQLExceptions
@ -31,7 +33,7 @@ parser.add_argument('-input_shape_name',
default='../../10m_cultural/ne_10m_populated_places.shp', default='../../10m_cultural/ne_10m_populated_places.shp',
help='input natural-earth shape file - with wikidataid columns') help='input natural-earth shape file - with wikidataid columns')
parser.add_argument('-input_lettercase', parser.add_argument('-input_lettercase',
default='lowercase', default='uppercase',
help='variables in thes hape file - lowercase or uppercase') help='variables in thes hape file - lowercase or uppercase')
parser.add_argument('-output_csv_name', parser.add_argument('-output_csv_name',
default='ne_10m_populated_places.csv', default='ne_10m_populated_places.csv',
@ -67,13 +69,75 @@ def get_sparql_numvalue(sresult, variable_id):
val = float(sresult[variable_id]['value']) val = float(sresult[variable_id]['value'])
return val return val
def post_process_wd_zh(properties):
    """Backfill Simplified and Traditional Chinese names in *properties*.

    Reads the optional keys ``name_en``, ``name_zh_default``,
    ``name_zh_hans`` and ``name_zh_hant`` and makes sure the keys
    ``name_zhs`` (Simplified) and ``name_zht`` (Traditional) exist:

    * ``name_zh_hans`` / ``name_zh_hant`` are the preferred candidates.
    * A missing candidate is filled from the "/"-separated variants in
      ``name_zh_default``, classified with ``hanzidentifier``.
    * A candidate identical to the (non-empty) English name is discarded.
    * If one script is still missing, the other script is used for it;
      if both are missing the value is an empty string.
    * Values already present under ``name_zhs`` / ``name_zht`` are kept.
    * Finally each value is reduced to its first "/"-separated variant,
      stripped of surrounding whitespace.

    The dict is modified in place and also returned. Intermediate keys
    such as ``name_zh_default`` are left untouched.
    """
    name_en_default = properties.get('name_en', u'')
    zh_Hans_fallback = properties.get('name_zh_hans', u'')
    zh_Hant_fallback = properties.get('name_zh_hant', u'')

    # sometimes the default Chinese name has several values in a list
    if 'name_zh_default' in properties:
        for name in properties['name_zh_default'].split('/'):
            # Two independent checks: one variant may fill both slots if
            # hanzidentifier reports it as valid for both scripts.
            if hanzidentifier.is_simplified(name) and not zh_Hans_fallback:
                zh_Hans_fallback = name
            if hanzidentifier.is_traditional(name) and not zh_Hant_fallback:
                zh_Hant_fallback = name

    # make sure we don't shove English values into Chinese namespace
    if name_en_default:
        if zh_Hans_fallback == name_en_default:
            zh_Hans_fallback = u''
        if zh_Hant_fallback == name_en_default:
            zh_Hant_fallback = u''

    # now make traditional and simplified Chinese name assignments,
    # each script falling back to the other one when it is empty
    if 'name_zhs' not in properties:
        properties['name_zhs'] = zh_Hans_fallback or zh_Hant_fallback or u''
    if 'name_zht' not in properties:
        properties['name_zht'] = zh_Hant_fallback or zh_Hans_fallback or u''

    # only select one of the options if the field is separated by "/"
    # for example if the field is "旧金山市县/三藩市市縣/舊金山市郡" only the first
    # one 旧金山市县 will be preserved.
    # The Simplified value lives under 'name_zhs'; the previous lookup of
    # 'name_zh' raised KeyError because that key is never set here.
    # 'name_zh' is still trimmed when a caller happens to supply it.
    for key in ('name_zhs', 'name_zht', 'name_zh'):
        if properties.get(key):
            properties[key] = properties[key].split('/')[0].strip()

    return properties
def fetchwikidata(a_wid): def fetchwikidata(a_wid):
""" """
Fetch wikidata with SPARQL Fetch wikidata with SPARQL
""" """
sparql = SPARQLWrapper("https://query.wikidata.org/sparql", 'natural_earth_name_localizer v1.1.0 (github.com/nvkelso/natural-earth-vector)') sparql = SPARQLWrapper("https://query.wikidata.org/sparql", 'natural_earth_name_localizer v1.1.1 (github.com/nvkelso/natural-earth-vector)')
query_template = """ query_template = """
SELECT SELECT
?e ?i ?r ?population ?e ?i ?r ?population
@ -102,6 +166,8 @@ def fetchwikidata(a_wid):
?name_ur ?name_ur
?name_vi ?name_vi
?name_zh ?name_zh
?name_zh_hans
?name_zh_hant
WHERE { WHERE {
{ {
SELECT DISTINCT ?e ?i ?r SELECT DISTINCT ?e ?i ?r
@ -119,7 +185,7 @@ def fetchwikidata(a_wid):
OPTIONAL{?e rdfs:label ?name_el FILTER((LANG(?name_el))="el").} OPTIONAL{?e rdfs:label ?name_el FILTER((LANG(?name_el))="el").}
OPTIONAL{?e rdfs:label ?name_en FILTER((LANG(?name_en))="en").} OPTIONAL{?e rdfs:label ?name_en FILTER((LANG(?name_en))="en").}
OPTIONAL{?e rdfs:label ?name_es FILTER((LANG(?name_es))="es").} OPTIONAL{?e rdfs:label ?name_es FILTER((LANG(?name_es))="es").}
OPTIONAL{?e rdfs:label ?name_fa FILTER((LANG(?name_fr))="fa").} OPTIONAL{?e rdfs:label ?name_fa FILTER((LANG(?name_fa))="fa").}
OPTIONAL{?e rdfs:label ?name_fr FILTER((LANG(?name_fr))="fr").} OPTIONAL{?e rdfs:label ?name_fr FILTER((LANG(?name_fr))="fr").}
OPTIONAL{?e rdfs:label ?name_he FILTER((LANG(?name_he))="he").} OPTIONAL{?e rdfs:label ?name_he FILTER((LANG(?name_he))="he").}
OPTIONAL{?e rdfs:label ?name_hi FILTER((LANG(?name_hi))="hi").} OPTIONAL{?e rdfs:label ?name_hi FILTER((LANG(?name_hi))="hi").}
@ -138,6 +204,8 @@ def fetchwikidata(a_wid):
OPTIONAL{?e rdfs:label ?name_ur FILTER((LANG(?name_ur))="ur").} OPTIONAL{?e rdfs:label ?name_ur FILTER((LANG(?name_ur))="ur").}
OPTIONAL{?e rdfs:label ?name_vi FILTER((LANG(?name_vi))="vi").} OPTIONAL{?e rdfs:label ?name_vi FILTER((LANG(?name_vi))="vi").}
OPTIONAL{?e rdfs:label ?name_zh FILTER((LANG(?name_zh))="zh").} OPTIONAL{?e rdfs:label ?name_zh FILTER((LANG(?name_zh))="zh").}
OPTIONAL{?e rdfs:label ?name_zh_hans FILTER((LANG(?name_zh_hans))="zh-hans").}
OPTIONAL{?e rdfs:label ?name_zh_hant FILTER((LANG(?name_zh_hant))="zh-hant").}
} }
""" """
@ -237,7 +305,8 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
"name_uk", "name_uk",
"name_ur", "name_ur",
"name_vi", "name_vi",
"name_zh" "name_zh",
"name_zht"
)) ))
@ -285,12 +354,13 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
name_el = get_sparql_label(result, 'name_el') name_el = get_sparql_label(result, 'name_el')
name_en = get_sparql_label(result, 'name_en') name_en = get_sparql_label(result, 'name_en')
name_es = get_sparql_label(result, 'name_es') name_es = get_sparql_label(result, 'name_es')
name_fr = get_sparql_label(result, 'name_fa') name_fa = get_sparql_label(result, 'name_fa')
name_fr = get_sparql_label(result, 'name_fr') name_fr = get_sparql_label(result, 'name_fr')
name_he = get_sparql_label(result, 'name_he') name_he = get_sparql_label(result, 'name_he')
name_hi = get_sparql_label(result, 'name_hi') name_hi = get_sparql_label(result, 'name_hi')
name_hu = get_sparql_label(result, 'name_hu') name_hu = get_sparql_label(result, 'name_hu')
name_id = get_sparql_label(result, 'name_id') name_id = get_sparql_label(result, 'name_id')
name_it = get_sparql_label(result, 'name_it')
name_ja = get_sparql_label(result, 'name_ja') name_ja = get_sparql_label(result, 'name_ja')
name_ko = get_sparql_label(result, 'name_ko') name_ko = get_sparql_label(result, 'name_ko')
name_lt = get_sparql_label(result, 'name_lt') name_lt = get_sparql_label(result, 'name_lt')
@ -303,7 +373,44 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
name_uk = get_sparql_label(result, 'name_uk') name_uk = get_sparql_label(result, 'name_uk')
name_ur = get_sparql_label(result, 'name_ur') name_ur = get_sparql_label(result, 'name_ur')
name_vi = get_sparql_label(result, 'name_vi') name_vi = get_sparql_label(result, 'name_vi')
name_zh = get_sparql_label(result, 'name_zh')
# not all Wikidata places have all name (label) translations
try:
name_en = get_sparql_label(result, 'name_en')
except:
name_en = u''
try:
name_zh_default = get_sparql_label(result, 'name_zh')
except:
name_zh_default = u''
try:
name_zh_hans = get_sparql_label(result, 'name_zh_hans')
except:
name_zh_hans = u''
try:
name_zh_hant = get_sparql_label(result, 'name_zh_hant')
except:
name_zh_hant = u''
chinese_names = { 'name_en' : name_en,
'name_zh_default' : name_zh_default,
'name_zh_hans' : name_zh_hans,
'name_zh_hant' : name_zh_hant
}
processed_chinese_names = post_process_wd_zh( chinese_names )
try:
name_zh = processed_chinese_names['name_zhs']
except:
name_zh = u''
try:
name_zht = processed_chinese_names['name_zht']
except:
name_zht = u''
writer.writerow(( writer.writerow((
wd_id, wd_id,
@ -334,7 +441,8 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
name_uk, name_uk,
name_ur, name_ur,
name_vi, name_vi,
name_zh name_zh,
name_zht
)) ))
print(' - JOB end -') print(' - JOB end -')

View File

@ -44,11 +44,11 @@ parser.add_argument('-output_csvsumlog',
args = parser.parse_args() args = parser.parse_args()
riverclean_regex = re.compile(r'\b('+'River|rivière de la |rivière à la|rivière des|Rivière De|Rivière du|Rivière aux|Rivière|rivier|Rio dos|Rio|Río La|Río de los|Río de las|Río dos|Río|sông|-folyó|folyó|canale di|canale|Nehri|Jiang'+r')\b', riverclean_regex = re.compile(r'\b('+'River|rivière de la |rivière à la|rivière des|Rivière De|Rivière du|Rivière aux|Rivière|rivier|Rio dos|Rio|Río La|Río de los|Río de las|Río dos|Río|sông|-folyó|folyó|canale di|canale|Nehri|Jiang|Sungai'+r')\b',
flags=re.IGNORECASE) flags=re.IGNORECASE)
# Some of these are proper names (Lake of the Ozark's, Clear Lake Reservoir) and # Some of these are proper names (Lake of the Ozark's, Clear Lake Reservoir) and
# shouldn't be stripped, but in the meantime, strip aggressively # shouldn't be stripped, but in the meantime, strip aggressively
lakeclean_regex = re.compile(r'\b('+'Lake of the|Grand Lake o\' the|Lake Reservoir|Grand Lake|Grant Lake|Lake of|Lake|Lago degli|Lago del|Lago de|lago di|Lago la|Lago do|Lago |lago d\'||Lago|Lac de la|lac d\'|lac des|Lac des|lac de|Lac de|Lac au|lac di|lac la|Lac La|Lac à l|lac à la|Lac|lac|-See|See|Laguna de|Laguna|Lake Reservoir|Reservoir|réservoir de la|réservoir de|Reservatório de|réservoir|Réservoir|Represa de|Represa|baie de|Bahía de|öböl|Gölü|järv|Embalse de|Embalse|Bacino di|bacino di|Bacino|bacino|Sông|Lough|Hồ'+r')\b', lakeclean_regex = re.compile(r'\b('+'Lake of the|Grand Lake o\' the|Lake Reservoir|Grand Lake|Grant Lake|Lake of|Lake|Lago degli|Lago del|Lago de|lago di|Lago la|Lago do|Lago |lago d\'||Lago|Lac de la|lac d\'|lac des|Lac des|lac de|Lac de|Lac au|lac di|lac la|Lac La|Lac à l|lac à la|Lac|lac|-See|See|Laguna de|Laguna|Lake Reservoir|Reservoir|réservoir de la|réservoir de|Reservatório de|réservoir|Réservoir|Represa de|Represa|baie de|Bahía de|öböl|Gölü|järv|Embalse de|Embalse|Bacino di|bacino di|Bacino|bacino|Sông|Lough|Hồ|Danau'+r')\b',
flags=re.IGNORECASE) flags=re.IGNORECASE)
#geolabels_regex = re.compile(r'\b('+'(wyspa)'+r')\b', #geolabels_regex = re.compile(r'\b('+'(wyspa)'+r')\b',
# flags=re.IGNORECASE) # flags=re.IGNORECASE)
@ -56,6 +56,8 @@ placeclean_regex = re.compile(r'\b('+'Municipality of|Municipality|First Nation|
flags=re.IGNORECASE) flags=re.IGNORECASE)
geo_region_regex = re.compile(r'\b('+'Região Autónoma dos'+r')\b', geo_region_regex = re.compile(r'\b('+'Região Autónoma dos'+r')\b',
flags=re.IGNORECASE) flags=re.IGNORECASE)
admin2_regex = re.compile(r'\b('+'County|Condado de|comté de|contea di|comté d|megye|Hrabstwo|Condado de|Quận|مقاطعة|City and Borough|census area di|Census Area|Borough|borough di|borough de|ilçesi'+r')\b',
flags=re.IGNORECASE)
admin1_regex = re.compile(r'\b('+'canton of |Canton of|Department|District Council|distretto di contea di|contea di|District de|distretto di|Distretto della|Distrik|distretto del|district|Constitutional Province|Province of |Provincia del|provincia delle|provincia della|provincia di|Provincia Constitucional del|Província Constitucional de|provincia de|Província de|Província do|Provincia de|Préfecture de|Provincia|Comunidad De|Autonome Provinz|Provincia Autónoma de|Provinz|Província|Departamento de|Departamento do|Autonomous Province of|Autonomous Province|Province de la|province de|Province du|Province|Provinsi|Municipality of |Municipality|Município de|Special Region of |Región Metropolitana de|Special Region|Autonomous Region|Capital Region of|Region of|-Region|Region|Governorate|Gouvernorat|Gubernatorstwo|Gobernación de|governatorato di|Capital of|Capital of|City Council|City and Borough of|City of|Città di|City|Región Metropolitana de|Metropolitan Region|Metropolitan Borough of|Metropolitan Borough|Borough Metropolitano de|London Borough of|district londonien|district londonien d\'|borough royal de|Royal Borough of|County Borough|Borough of|Borough Council|Metropoliten Borough|londonien de|district royal de|County|Old Royal Capital|(distrikt)|Distrik|(borough)|Cantão central de|cantone della|(cantão)|(departamento)|(departement)|Región del|Región de|Región|gouvernorat de|Gouvernorat|kormányzóság|regione di|Regione del|Prefectura de|prefettura di|Autonome Oblast|Oblast Autônomo|Autonomous Oblast|Oblast\' dell\'|Obwód Autonomiczny|Kraï de|Kraï|Oblast de|Óblast de|Oblast\' di|Oblast\'|oblast|distrito de|Distrito do|Distrito|métropolitain de|Voivodia da|cantone di|cantone dell\'|Munisipalitas\' di|Munisipalitas|Emirato di|Emirato|cantón del|cantón de|canton du|cantón|Καντόνι του|Καντόνι της|Ζουπανία του|Επαρχία του|Δήμος|Κυβερνείο του|distretto|Région autonome du|région de|Governamento de|Kegubernuran|comté de|parrocchia di|obwód|, London|, Londra|, Nya 
Zeeland|Daerah Istimewa|Autónoma del|Parish of|Parish|, Barbados|Circondario autonomo dei|Circondario autonomo|circondario autonomo degli|Okręg Autonomiczny|Dystrykt|-Distrikt|Distrikt|distriktet|, प्रांत|, पैराग्वे|, Zambia|, Kenya|, Καμερούν|, Τζαμάικα|, Barbados|, Londra|, Bahama|kommun|Ciudad de|, Gambia|, Botswana|tartomány|körzet|Munizip|division|Conselho do Borough de|Rejon|Raionul|Kotar|megye|Żupania|comune distrettuale di|comune distrettuale|Comune di|comune|Condado de|Condado|Kotamadya|Região Autónoma dos|Região Autónoma|Região|Guvernementet|Gobernación del|Gobernación'+r')\b', admin1_regex = re.compile(r'\b('+'canton of |Canton of|Department|District Council|distretto di contea di|contea di|District de|distretto di|Distretto della|Distrik|distretto del|district|Constitutional Province|Province of |Provincia del|provincia delle|provincia della|provincia di|Provincia Constitucional del|Província Constitucional de|provincia de|Província de|Província do|Provincia de|Préfecture de|Provincia|Comunidad De|Autonome Provinz|Provincia Autónoma de|Provinz|Província|Departamento de|Departamento do|Autonomous Province of|Autonomous Province|Province de la|province de|Province du|Province|Provinsi|Municipality of |Municipality|Município de|Special Region of |Región Metropolitana de|Special Region|Autonomous Region|Capital Region of|Region of|-Region|Region|Governorate|Gouvernorat|Gubernatorstwo|Gobernación de|governatorato di|Capital of|Capital of|City Council|City and Borough of|City of|Città di|City|Región Metropolitana de|Metropolitan Region|Metropolitan Borough of|Metropolitan Borough|Borough Metropolitano de|London Borough of|district londonien|district londonien d\'|borough royal de|Royal Borough of|County Borough|Borough of|Borough Council|Metropoliten Borough|londonien de|district royal de|County|Old Royal Capital|(distrikt)|Distrik|(borough)|Cantão central de|cantone della|(cantão)|(departamento)|(departement)|Región del|Región de|Región|gouvernorat 
de|Gouvernorat|kormányzóság|regione di|Regione del|Prefectura de|prefettura di|Autonome Oblast|Oblast Autônomo|Autonomous Oblast|Oblast\' dell\'|Obwód Autonomiczny|Kraï de|Kraï|Oblast de|Óblast de|Oblast\' di|Oblast\'|oblast|distrito de|Distrito do|Distrito|métropolitain de|Voivodia da|cantone di|cantone dell\'|Munisipalitas\' di|Munisipalitas|Emirato di|Emirato|cantón del|cantón de|canton du|cantón|Καντόνι του|Καντόνι της|Ζουπανία του|Επαρχία του|Δήμος|Κυβερνείο του|distretto|Région autonome du|région de|Governamento de|Kegubernuran|comté de|parrocchia di|obwód|, London|, Londra|, Nya Zeeland|Daerah Istimewa|Autónoma del|Parish of|Parish|, Barbados|Circondario autonomo dei|Circondario autonomo|circondario autonomo degli|Okręg Autonomiczny|Dystrykt|-Distrikt|Distrikt|distriktet|, प्रांत|, पैराग्वे|, Zambia|, Kenya|, Καμερούν|, Τζαμάικα|, Barbados|, Londra|, Bahama|kommun|Ciudad de|, Gambia|, Botswana|tartomány|körzet|Munizip|division|Conselho do Borough de|Rejon|Raionul|Kotar|megye|Żupania|comune distrettuale di|comune distrettuale|Comune di|comune|Condado de|Condado|Kotamadya|Região Autónoma dos|Região Autónoma|Região|Guvernementet|Gobernación del|Gobernación'+r')\b',
flags=re.IGNORECASE) flags=re.IGNORECASE)
admin0_regex = re.compile(r'\b('+'(district)|(địa hạt)'+r')\b', admin0_regex = re.compile(r'\b('+'(district)|(địa hạt)'+r')\b',
@ -75,7 +77,7 @@ wddic = defaultdict(dict)
wdredirects = defaultdict(dict) wdredirects = defaultdict(dict)
name_field_prefix = 'name_' name_field_prefix = 'name_'
languages = ['ar','bn','de','en','es','fr','el','he','hi','hu','id','it','ja','ko','nl','pl','pt','ru','sv','tr','uk','ur','vi','zh'] languages = ['ar','bn','de','el','en','es','fa','fr','he','hi','hu','id','it','ja','ko','nl','pl','pt','ru','sv','tr','uk','ur','vi','zh','zht']
new_properties = [] new_properties = []
with open(args.input_csv, newline='') as csvfile: with open(args.input_csv, newline='') as csvfile:
@ -94,7 +96,7 @@ with open(args.input_csv, newline='') as csvfile:
# as proxy for featureclass # as proxy for featureclass
# Rivers ... # Rivers ...
if args.input_shape.lower().find('river') > 0: if args.input_shape.lower().find('rivers_lake') > 0:
wddic[qid][d] = riverclean_regex.sub('', wddic[qid][d]) wddic[qid][d] = riverclean_regex.sub('', wddic[qid][d])
# Comma ... # Comma ...
@ -108,7 +110,7 @@ with open(args.input_csv, newline='') as csvfile:
wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')] wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]
# Lakes ... # Lakes ...
if args.input_shape.lower().find('lake') > 0: if args.input_shape.lower().find('lake') > 0 and args.input_shape.lower().find('10m_physical') > 0:
#if d == 'name_en' and wddic[qid]['name_en'] != 'Lake of the Woods': #if d == 'name_en' and wddic[qid]['name_en'] != 'Lake of the Woods':
wddic[qid][d]=lakeclean_regex.sub('', wddic[qid][d] ) wddic[qid][d]=lakeclean_regex.sub('', wddic[qid][d] )
@ -153,6 +155,19 @@ with open(args.input_csv, newline='') as csvfile:
# RTL languages and LTR figure each other out in python 3 # RTL languages and LTR figure each other out in python 3
wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')] wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]
# Admin 2 counties ...
if args.input_shape.lower().find('admin_2') > 0:
wddic[qid][d] = admin2_regex.sub('', wddic[qid][d])
# Parenthetical ...
if wddic[qid][d].find('(') > 0:
# RTL languages and LTR figure each other out in python 3
wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]
#name_ko: remove last "주""State" character
if d == 'name_ko' and wddic[qid]['name_ko'] and wddic[qid]['name_ko'][-1] == "":
wddic[qid]['name_ko'] = wddic[qid]['name_ko'][:-1]
# Admin 1 states, provinces ... # Admin 1 states, provinces ...
if args.input_shape.lower().find('admin_1') > 0: if args.input_shape.lower().find('admin_1') > 0:
wddic[qid][d] = admin1_regex.sub('', wddic[qid][d]) wddic[qid][d] = admin1_regex.sub('', wddic[qid][d])