mirror of
https://github.com/oDinZu/natural-earth-vector.git
synced 2025-02-22 00:04:57 -05:00
update Wikidata scripts for Farsi, Chinese simplified, Chinese traditional
This commit is contained in:
parent
75feea9848
commit
04e61779fe
11
run_all.sh
11
run_all.sh
@ -33,14 +33,18 @@ rm -f $logmd
|
||||
# | mode |LetterCase| shape_path | shape filename
|
||||
# == 10m ================= |=========== |==========| ============| ================================================
|
||||
function run10m {
|
||||
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_sovereignty
|
||||
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries
|
||||
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_sovereignty # this and other admin_0 run, but Mapshaper overwrites them
|
||||
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries # instead results are copied into housekeeping file's lookup table
|
||||
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries_lakes
|
||||
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_map_units
|
||||
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_map_subunits
|
||||
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_disputed_areas
|
||||
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces
|
||||
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces # this and other admin_1 run, but Mapshaper overwrites them
|
||||
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces_lakes
|
||||
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_label_points_details # Mapshaper uses this to generate admin_1 polys
|
||||
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_2_label_points_details # Mapshaper uses this to generate admin_2 polys
|
||||
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_2_counties # this and other admin_2 run, but Mapshaper overwrites them
|
||||
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_2_counties_lakes
|
||||
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_airports
|
||||
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_populated_places # this should be built before derived Makefile themes run
|
||||
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_geographic_lines # this should be built before derived Makefile themes run
|
||||
@ -57,7 +61,6 @@ function run10m {
|
||||
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_lake_centerlines_scale_rank
|
||||
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_europe
|
||||
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_north_america
|
||||
#./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_label_points_details # geometry errors
|
||||
}
|
||||
|
||||
function run50m {
|
||||
|
@ -6,6 +6,7 @@ install:
|
||||
pip3 install -U SPARQLWrapper
|
||||
pip3 install -U fiona
|
||||
pip3 install -U csvtomd
|
||||
pip3 install -U hanzidentifier
|
||||
|
||||
clean:
|
||||
cd ../.. && rm -rf temp_shape/*
|
||||
|
@ -6,6 +6,7 @@
|
||||
pip3 install -U fiona
|
||||
pip3 install -U csvtomd
|
||||
pip3 install -U requests
|
||||
pip3 install -U hanzidentifier
|
||||
|
||||
#run from the project root ( expected 30-40 minutes )
|
||||
# be careful this is running 'make all'
|
||||
@ -357,34 +358,36 @@ lettercase = lowercase variable names [wikidataid, name_ar, name_bn, name_de, na
|
||||
see the _latest_ information in the `./run_all.sh`
|
||||
|
||||
|
||||
### supported languages ( now: 24)
|
||||
### supported languages ( now: 26)
|
||||
|
||||
variable name | language | language wikipedia link
|
||||
--------------|--------------|----------------------------------------------------
|
||||
NAME_AR | Arabic | https://en.wikipedia.org/wiki/Arabic
|
||||
NAME_BN | Bengali | https://en.wikipedia.org/wiki/Bengali_language
|
||||
NAME_DE | German | https://en.wikipedia.org/wiki/German_language
|
||||
NAME_EN | English | https://en.wikipedia.org/wiki/English_language
|
||||
NAME_ES | Spanish | https://en.wikipedia.org/wiki/Spanish_language
|
||||
NAME_FR | French | https://en.wikipedia.org/wiki/French_language
|
||||
NAME_EL | Modern Greek | https://en.wikipedia.org/wiki/Modern_Greek
|
||||
NAME_HE | Hebrew | https://en.wikipedia.org/wiki/Hebrew_language
|
||||
NAME_HI | Hindi | https://en.wikipedia.org/wiki/Hindi
|
||||
NAME_HU | Hungarian | https://en.wikipedia.org/wiki/Hungarian_language
|
||||
NAME_ID | Indonesian | https://en.wikipedia.org/wiki/Indonesian_language
|
||||
NAME_IT | Italian | https://en.wikipedia.org/wiki/Italian_language
|
||||
NAME_JA | Japanese | https://en.wikipedia.org/wiki/Japanese_language
|
||||
NAME_KO | Korean | https://en.wikipedia.org/wiki/Korean_language
|
||||
NAME_NL | Dutch | https://en.wikipedia.org/wiki/Dutch_language
|
||||
NAME_PL | Polish | https://en.wikipedia.org/wiki/Polish_language
|
||||
NAME_PT | Portuguese | https://en.wikipedia.org/wiki/Portuguese_language
|
||||
NAME_RU | Russian | https://en.wikipedia.org/wiki/Russian_language
|
||||
NAME_SV | Swedish | https://en.wikipedia.org/wiki/Swedish_language
|
||||
NAME_TR | Turkish | https://en.wikipedia.org/wiki/Turkish_language
|
||||
NAME_UK | Ukrainian | https://en.wikipedia.org/wiki/Ukrainian_language
|
||||
NAME_UR | Urdu | https://en.wikipedia.org/wiki/Urdu
|
||||
NAME_VI | Vietnamese | https://en.wikipedia.org/wiki/Vietnamese_language
|
||||
NAME_ZH | Chinese | https://en.wikipedia.org/wiki/Chinese_language
|
||||
variable name | language | language wikipedia link
|
||||
--------------|-----------------------|----------------------------------------------------
|
||||
NAME_AR | Arabic | https://en.wikipedia.org/wiki/Arabic
|
||||
NAME_BN | Bengali | https://en.wikipedia.org/wiki/Bengali_language
|
||||
NAME_DE | German | https://en.wikipedia.org/wiki/German_language
|
||||
NAME_EN | English | https://en.wikipedia.org/wiki/English_language
|
||||
NAME_EL | Greek (modern) | https://en.wikipedia.org/wiki/Modern_Greek
|
||||
NAME_ES | Spanish | https://en.wikipedia.org/wiki/Spanish_language
|
||||
NAME_FA | Farsi | https://en.wikipedia.org/wiki/Persian_language
|
||||
NAME_FR | French | https://en.wikipedia.org/wiki/French_language
|
||||
NAME_HE | Hebrew | https://en.wikipedia.org/wiki/Hebrew_language
|
||||
NAME_HI | Hindi | https://en.wikipedia.org/wiki/Hindi
|
||||
NAME_HU | Hungarian | https://en.wikipedia.org/wiki/Hungarian_language
|
||||
NAME_ID | Indonesian | https://en.wikipedia.org/wiki/Indonesian_language
|
||||
NAME_IT | Italian | https://en.wikipedia.org/wiki/Italian_language
|
||||
NAME_JA | Japanese | https://en.wikipedia.org/wiki/Japanese_language
|
||||
NAME_KO | Korean | https://en.wikipedia.org/wiki/Korean_language
|
||||
NAME_NL | Dutch | https://en.wikipedia.org/wiki/Dutch_language
|
||||
NAME_PL | Polish | https://en.wikipedia.org/wiki/Polish_language
|
||||
NAME_PT | Portuguese | https://en.wikipedia.org/wiki/Portuguese_language
|
||||
NAME_RU | Russian | https://en.wikipedia.org/wiki/Russian_language
|
||||
NAME_SV | Swedish | https://en.wikipedia.org/wiki/Swedish_language
|
||||
NAME_TR | Turkish | https://en.wikipedia.org/wiki/Turkish_language
|
||||
NAME_UK | Ukrainian | https://en.wikipedia.org/wiki/Ukrainian_language
|
||||
NAME_UR | Urdu | https://en.wikipedia.org/wiki/Urdu
|
||||
NAME_VI | Vietnamese | https://en.wikipedia.org/wiki/Vietnamese_language
|
||||
NAME_ZH | Chinese (simplified) | https://en.wikipedia.org/wiki/Chinese_language
|
||||
NAME_ZHT | Chinese (traditional) | https://en.wikipedia.org/wiki/Traditional_Chinese_characters
|
||||
|
||||
# Name cleaning
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#-- pip3 install -U SPARQLWrapper
|
||||
#-- pip3 install -U fiona
|
||||
#-- pip3 install -U hanzidentifier
|
||||
|
||||
"""
|
||||
Fetch Wikidata Labels
|
||||
@ -20,6 +21,7 @@ import argparse
|
||||
import csv
|
||||
import sys
|
||||
import time
|
||||
import hanzidentifier
|
||||
#import requests
|
||||
|
||||
from SPARQLWrapper import SPARQLWrapper, JSON, SPARQLExceptions
|
||||
@ -31,7 +33,7 @@ parser.add_argument('-input_shape_name',
|
||||
default='../../10m_cultural/ne_10m_populated_places.shp',
|
||||
help='input natural-earth shape file - with wikidataid columns')
|
||||
parser.add_argument('-input_lettercase',
|
||||
default='lowercase',
|
||||
default='uppercase',
|
||||
help='variables in thes hape file - lowercase or uppercase')
|
||||
parser.add_argument('-output_csv_name',
|
||||
default='ne_10m_populated_places.csv',
|
||||
@ -67,13 +69,75 @@ def get_sparql_numvalue(sresult, variable_id):
|
||||
val = float(sresult[variable_id]['value'])
|
||||
return val
|
||||
|
||||
def post_process_wd_zh(properties):
    """Backfill the Simplified and Traditional Chinese names of one feature.

    The explicit script-tagged labels (``name_zh_hans`` / ``name_zh_hant``)
    are preferred.  When one of them is empty, the ``/``-separated values
    of the generic ``name_zh_default`` label are classified with
    ``hanzidentifier`` and used to backfill it.  If only one script is
    available at the end, it is used for both outputs.

    Args:
        properties: dict of labels for a single Wikidata item.  Keys read
            (all optional): ``name_en``, ``name_zh_default``,
            ``name_zh_hans``, ``name_zh_hant``, and any pre-existing
            ``name_zh`` / ``name_zhs`` / ``name_zht``.

    Returns:
        The same dict, mutated in place, with ``name_zhs`` (Simplified)
        and ``name_zht`` (Traditional) always set (possibly to ``u''``).
        The intermediate ``name_zh_default`` / ``name_zh_hans`` /
        ``name_zh_hant`` keys are left untouched.
    """
    name_en_default = properties.get('name_en', u'')
    zh_Hans_fallback = properties.get('name_zh_hans', u'')
    zh_Hant_fallback = properties.get('name_zh_hant', u'')

    # sometimes the default Chinese name has several values in a list
    if 'name_zh_default' in properties:
        for name in properties['name_zh_default'].split('/'):
            # a candidate may qualify for both scripts, so both checks run
            if hanzidentifier.is_simplified(name) and not zh_Hans_fallback:
                zh_Hans_fallback = name
            if hanzidentifier.is_traditional(name) and not zh_Hant_fallback:
                zh_Hant_fallback = name

    # make sure we don't shove English values into Chinese namespace
    if name_en_default:
        if zh_Hans_fallback == name_en_default:
            zh_Hans_fallback = u''
        if zh_Hant_fallback == name_en_default:
            zh_Hant_fallback = u''

    # now make traditional and simplified Chinese name assignments;
    # each script falls back to the other one before giving up
    if 'name_zhs' not in properties:
        properties['name_zhs'] = zh_Hans_fallback or zh_Hant_fallback or u''

    if 'name_zht' not in properties:
        properties['name_zht'] = zh_Hant_fallback or zh_Hans_fallback or u''

    # only select one of the options if the field is separated by "/"
    # for example if the field is "旧金山市县/三藩市市縣/舊金山市郡" only the first
    # one 旧金山市县 will be preserved.
    # 'name_zh' is optional (the caller normally does not supply it), so it
    # must not be indexed unconditionally; 'name_zhs' is the key this
    # function actually fills and therefore the one that needs trimming.
    for key in ('name_zh', 'name_zhs', 'name_zht'):
        value = properties.get(key)
        if value:
            properties[key] = value.split('/')[0].strip()

    return properties
|
||||
|
||||
|
||||
def fetchwikidata(a_wid):
|
||||
"""
|
||||
Fetch wikidata with SPARQL
|
||||
"""
|
||||
|
||||
sparql = SPARQLWrapper("https://query.wikidata.org/sparql", 'natural_earth_name_localizer v1.1.0 (github.com/nvkelso/natural-earth-vector)')
|
||||
sparql = SPARQLWrapper("https://query.wikidata.org/sparql", 'natural_earth_name_localizer v1.1.1 (github.com/nvkelso/natural-earth-vector)')
|
||||
query_template = """
|
||||
SELECT
|
||||
?e ?i ?r ?population
|
||||
@ -102,6 +166,8 @@ def fetchwikidata(a_wid):
|
||||
?name_ur
|
||||
?name_vi
|
||||
?name_zh
|
||||
?name_zh_hans
|
||||
?name_zh_hant
|
||||
WHERE {
|
||||
{
|
||||
SELECT DISTINCT ?e ?i ?r
|
||||
@ -119,7 +185,7 @@ def fetchwikidata(a_wid):
|
||||
OPTIONAL{?e rdfs:label ?name_el FILTER((LANG(?name_el))="el").}
|
||||
OPTIONAL{?e rdfs:label ?name_en FILTER((LANG(?name_en))="en").}
|
||||
OPTIONAL{?e rdfs:label ?name_es FILTER((LANG(?name_es))="es").}
|
||||
OPTIONAL{?e rdfs:label ?name_fa FILTER((LANG(?name_fr))="fa").}
|
||||
OPTIONAL{?e rdfs:label ?name_fa FILTER((LANG(?name_fa))="fa").}
|
||||
OPTIONAL{?e rdfs:label ?name_fr FILTER((LANG(?name_fr))="fr").}
|
||||
OPTIONAL{?e rdfs:label ?name_he FILTER((LANG(?name_he))="he").}
|
||||
OPTIONAL{?e rdfs:label ?name_hi FILTER((LANG(?name_hi))="hi").}
|
||||
@ -138,6 +204,8 @@ def fetchwikidata(a_wid):
|
||||
OPTIONAL{?e rdfs:label ?name_ur FILTER((LANG(?name_ur))="ur").}
|
||||
OPTIONAL{?e rdfs:label ?name_vi FILTER((LANG(?name_vi))="vi").}
|
||||
OPTIONAL{?e rdfs:label ?name_zh FILTER((LANG(?name_zh))="zh").}
|
||||
OPTIONAL{?e rdfs:label ?name_zh_hans FILTER((LANG(?name_zh_hans))="zh-hans").}
|
||||
OPTIONAL{?e rdfs:label ?name_zh_hant FILTER((LANG(?name_zh_hant))="zh-hant").}
|
||||
}
|
||||
|
||||
"""
|
||||
@ -237,7 +305,8 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
|
||||
"name_uk",
|
||||
"name_ur",
|
||||
"name_vi",
|
||||
"name_zh"
|
||||
"name_zh",
|
||||
"name_zht"
|
||||
))
|
||||
|
||||
|
||||
@ -285,12 +354,13 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
|
||||
name_el = get_sparql_label(result, 'name_el')
|
||||
name_en = get_sparql_label(result, 'name_en')
|
||||
name_es = get_sparql_label(result, 'name_es')
|
||||
name_fr = get_sparql_label(result, 'name_fa')
|
||||
name_fa = get_sparql_label(result, 'name_fa')
|
||||
name_fr = get_sparql_label(result, 'name_fr')
|
||||
name_he = get_sparql_label(result, 'name_he')
|
||||
name_hi = get_sparql_label(result, 'name_hi')
|
||||
name_hu = get_sparql_label(result, 'name_hu')
|
||||
name_id = get_sparql_label(result, 'name_id')
|
||||
name_it = get_sparql_label(result, 'name_it')
|
||||
name_ja = get_sparql_label(result, 'name_ja')
|
||||
name_ko = get_sparql_label(result, 'name_ko')
|
||||
name_lt = get_sparql_label(result, 'name_lt')
|
||||
@ -303,7 +373,44 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
|
||||
name_uk = get_sparql_label(result, 'name_uk')
|
||||
name_ur = get_sparql_label(result, 'name_ur')
|
||||
name_vi = get_sparql_label(result, 'name_vi')
|
||||
name_zh = get_sparql_label(result, 'name_zh')
|
||||
|
||||
# not all Wikidata places have all name (label) translations
|
||||
try:
|
||||
name_en = get_sparql_label(result, 'name_en')
|
||||
except:
|
||||
name_en = u''
|
||||
|
||||
try:
|
||||
name_zh_default = get_sparql_label(result, 'name_zh')
|
||||
except:
|
||||
name_zh_default = u''
|
||||
|
||||
try:
|
||||
name_zh_hans = get_sparql_label(result, 'name_zh_hans')
|
||||
except:
|
||||
name_zh_hans = u''
|
||||
|
||||
try:
|
||||
name_zh_hant = get_sparql_label(result, 'name_zh_hant')
|
||||
except:
|
||||
name_zh_hant = u''
|
||||
|
||||
chinese_names = { 'name_en' : name_en,
|
||||
'name_zh_default' : name_zh_default,
|
||||
'name_zh_hans' : name_zh_hans,
|
||||
'name_zh_hant' : name_zh_hant
|
||||
}
|
||||
|
||||
processed_chinese_names = post_process_wd_zh( chinese_names )
|
||||
|
||||
try:
|
||||
name_zh = processed_chinese_names['name_zhs']
|
||||
except:
|
||||
name_zh = u''
|
||||
try:
|
||||
name_zht = processed_chinese_names['name_zht']
|
||||
except:
|
||||
name_zht = u''
|
||||
|
||||
writer.writerow((
|
||||
wd_id,
|
||||
@ -334,7 +441,8 @@ with open(args.output_csv_name, "w", encoding='utf-8') as f:
|
||||
name_uk,
|
||||
name_ur,
|
||||
name_vi,
|
||||
name_zh
|
||||
name_zh,
|
||||
name_zht
|
||||
))
|
||||
|
||||
print(' - JOB end -')
|
@ -44,11 +44,11 @@ parser.add_argument('-output_csvsumlog',
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
riverclean_regex = re.compile(r'\b('+'River|rivière de la |rivière à la|rivière des|Rivière De|Rivière du|Rivière aux|Rivière|rivier|Rio dos|Rio|Río La|Río de los|Río de las|Río dos|Río|sông|-folyó|folyó|canale di|canale|Nehri|Jiang'+r')\b',
|
||||
riverclean_regex = re.compile(r'\b('+'River|rivière de la |rivière à la|rivière des|Rivière De|Rivière du|Rivière aux|Rivière|rivier|Rio dos|Rio|Río La|Río de los|Río de las|Río dos|Río|sông|-folyó|folyó|canale di|canale|Nehri|Jiang|Sungai'+r')\b',
|
||||
flags=re.IGNORECASE)
|
||||
# Some of these are proper names (Lake of the Ozark's, Clear Lake Reservoir) and
|
||||
# shouldn't be stripped, but in the meantime, strip aggressively
|
||||
lakeclean_regex = re.compile(r'\b('+'Lake of the|Grand Lake o\' the|Lake Reservoir|Grand Lake|Grant Lake|Lake of|Lake|Lago degli|Lago del|Lago de|lago di|Lago la|Lago do|Lago |lago d\'||Lago|Lac de la|lac d\'|lac des|Lac des|lac de|Lac de|Lac au|lac di|lac la|Lac La|Lac à l’|lac à la|Lac|lac|-See|See|Laguna de|Laguna|Lake Reservoir|Reservoir|réservoir de la|réservoir de|Reservatório de|réservoir|Réservoir|Represa de|Represa|baie de|Bahía de|öböl|Gölü|järv|Embalse de|Embalse|Bacino di|bacino di|Bacino|bacino|Sông|Lough|Hồ'+r')\b',
|
||||
lakeclean_regex = re.compile(r'\b('+'Lake of the|Grand Lake o\' the|Lake Reservoir|Grand Lake|Grant Lake|Lake of|Lake|Lago degli|Lago del|Lago de|lago di|Lago la|Lago do|Lago |lago d\'||Lago|Lac de la|lac d\'|lac des|Lac des|lac de|Lac de|Lac au|lac di|lac la|Lac La|Lac à l’|lac à la|Lac|lac|-See|See|Laguna de|Laguna|Lake Reservoir|Reservoir|réservoir de la|réservoir de|Reservatório de|réservoir|Réservoir|Represa de|Represa|baie de|Bahía de|öböl|Gölü|järv|Embalse de|Embalse|Bacino di|bacino di|Bacino|bacino|Sông|Lough|Hồ|Danau'+r')\b',
|
||||
flags=re.IGNORECASE)
|
||||
#geolabels_regex = re.compile(r'\b('+'(wyspa)'+r')\b',
|
||||
# flags=re.IGNORECASE)
|
||||
@ -56,6 +56,8 @@ placeclean_regex = re.compile(r'\b('+'Municipality of|Municipality|First Nation|
|
||||
flags=re.IGNORECASE)
|
||||
geo_region_regex = re.compile(r'\b('+'Região Autónoma dos'+r')\b',
|
||||
flags=re.IGNORECASE)
|
||||
admin2_regex = re.compile(r'\b('+'County|Condado de|comté de|contea di|comté d|megye|Hrabstwo|Condado de|Quận|مقاطعة|City and Borough|census area di|Census Area|Borough|borough di|borough de|ilçesi'+r')\b',
|
||||
flags=re.IGNORECASE)
|
||||
admin1_regex = re.compile(r'\b('+'canton of |Canton of|Department|District Council|distretto di contea di|contea di|District de|distretto di|Distretto della|Distrik|distretto del|district|Constitutional Province|Province of |Provincia del|provincia delle|provincia della|provincia di|Provincia Constitucional del|Província Constitucional de|provincia de|Província de|Província do|Provincia de|Préfecture de|Provincia|Comunidad De|Autonome Provinz|Provincia Autónoma de|Provinz|Província|Departamento de|Departamento do|Autonomous Province of|Autonomous Province|Province de la|province de|Province du|Province|Provinsi|Municipality of |Municipality|Município de|Special Region of |Región Metropolitana de|Special Region|Autonomous Region|Capital Region of|Region of|-Region|Region|Governorate|Gouvernorat|Gubernatorstwo|Gobernación de|governatorato di|Capital of|Capital of|City Council|City and Borough of|City of|Città di|City|Región Metropolitana de|Metropolitan Region|Metropolitan Borough of|Metropolitan Borough|Borough Metropolitano de|London Borough of|district londonien|district londonien d\'|borough royal de|Royal Borough of|County Borough|Borough of|Borough Council|Metropoliten Borough|londonien de|district royal de|County|Old Royal Capital|(distrikt)|Distrik|(borough)|Cantão central de|cantone della|(cantão)|(departamento)|(departement)|Región del|Región de|Región|gouvernorat de|Gouvernorat|kormányzóság|regione di|Regione del|Prefectura de|prefettura di|Autonome Oblast|Oblast Autônomo|Autonomous Oblast|Oblast\' dell\'|Obwód Autonomiczny|Kraï de|Kraï|Oblast de|Óblast de|Oblast\' di|Oblast\'|oblast|distrito de|Distrito do|Distrito|métropolitain de|Voivodia da|cantone di|cantone dell\'|Munisipalitas\' di|Munisipalitas|Emirato di|Emirato|cantón del|cantón de|canton du|cantón|Καντόνι του|Καντόνι της|Ζουπανία του|Επαρχία του|Δήμος|Κυβερνείο του|distretto|Région autonome du|région de|Governamento de|Kegubernuran|comté de|parrocchia di|obwód|, London|, Londra|, Nya 
Zeeland|Daerah Istimewa|Autónoma del|Parish of|Parish|, Barbados|Circondario autonomo dei|Circondario autonomo|circondario autonomo degli|Okręg Autonomiczny|Dystrykt|-Distrikt|Distrikt|distriktet|, प्रांत|, पैराग्वे|, Zambia|, Kenya|, Καμερούν|, Τζαμάικα|, Barbados|, Londra|, Bahama|kommun|Ciudad de|, Gambia|, Botswana|tartomány|körzet|Munizip|division|Conselho do Borough de|Rejon|Raionul|Kotar|megye|Żupania|comune distrettuale di|comune distrettuale|Comune di|comune|Condado de|Condado|Kotamadya|Região Autónoma dos|Região Autónoma|Região|Guvernementet|Gobernación del|Gobernación'+r')\b',
|
||||
flags=re.IGNORECASE)
|
||||
admin0_regex = re.compile(r'\b('+'(district)|(địa hạt)'+r')\b',
|
||||
@ -75,7 +77,7 @@ wddic = defaultdict(dict)
|
||||
wdredirects = defaultdict(dict)
|
||||
|
||||
name_field_prefix = 'name_'
|
||||
languages = ['ar','bn','de','en','es','fr','el','he','hi','hu','id','it','ja','ko','nl','pl','pt','ru','sv','tr','uk','ur','vi','zh']
|
||||
languages = ['ar','bn','de','el','en','es','fa','fr','he','hi','hu','id','it','ja','ko','nl','pl','pt','ru','sv','tr','uk','ur','vi','zh','zht']
|
||||
new_properties = []
|
||||
|
||||
with open(args.input_csv, newline='') as csvfile:
|
||||
@ -94,7 +96,7 @@ with open(args.input_csv, newline='') as csvfile:
|
||||
# as proxy for featureclass
|
||||
|
||||
# Rivers ...
|
||||
if args.input_shape.lower().find('river') > 0:
|
||||
if args.input_shape.lower().find('rivers_lake') > 0:
|
||||
wddic[qid][d] = riverclean_regex.sub('', wddic[qid][d])
|
||||
|
||||
# Comma ...
|
||||
@ -108,7 +110,7 @@ with open(args.input_csv, newline='') as csvfile:
|
||||
wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]
|
||||
|
||||
# Lakes ...
|
||||
if args.input_shape.lower().find('lake') > 0:
|
||||
if args.input_shape.lower().find('lake') > 0 and args.input_shape.lower().find('10m_physical') > 0:
|
||||
#if d == 'name_en' and wddic[qid]['name_en'] != 'Lake of the Woods':
|
||||
wddic[qid][d]=lakeclean_regex.sub('', wddic[qid][d] )
|
||||
|
||||
@ -153,6 +155,19 @@ with open(args.input_csv, newline='') as csvfile:
|
||||
# RTL languages and LTR figure each other out in python 3
|
||||
wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]
|
||||
|
||||
# Admin 2 counties ...
|
||||
if args.input_shape.lower().find('admin_2') > 0:
|
||||
wddic[qid][d] = admin2_regex.sub('', wddic[qid][d])
|
||||
|
||||
# Parenthetical ...
|
||||
if wddic[qid][d].find('(') > 0:
|
||||
# RTL languages and LTR figure each other out in python 3
|
||||
wddic[qid][d] = wddic[qid][d][0:wddic[qid][d].find('(')]
|
||||
|
||||
#name_ko: remove last "군" ("County") character
|
||||
if d == 'name_ko' and wddic[qid]['name_ko'] and wddic[qid]['name_ko'][-1] == "군":
|
||||
wddic[qid]['name_ko'] = wddic[qid]['name_ko'][:-1]
|
||||
|
||||
# Admin 1 states, provinces ...
|
||||
if args.input_shape.lower().find('admin_1') > 0:
|
||||
wddic[qid][d] = admin1_regex.sub('', wddic[qid][d])
|
||||
|
Loading…
x
Reference in New Issue
Block a user