#!/usr/bin/env python3

#-- pip3 install -U SPARQLWrapper
#-- pip3 install -U fiona
#-- pip3 install -U hanzidentifier

"""
|
|
Fetch Wikidata Labels
|
|
|
|
|
|
Wikidata limitations:
|
|
population: lot of constraint violations.
|
|
https://www.wikidata.org/wiki/Property_talk:P1082#Querying_for_the_latest_value
|
|
|
|
elevation : temporary disabled , reason: SPARQL performance problem.
|
|
"""

import argparse
import csv
import sys
import time

import fiona
import hanzidentifier
#import requests
from SPARQLWrapper import SPARQLWrapper, JSON, SPARQLExceptions

parser = argparse.ArgumentParser(description='Fetch wikidata labels for Natural-Earth')
parser.add_argument('-input_shape_name',
                    default='../../10m_cultural/ne_10m_populated_places.shp',
                    help='input natural-earth shape file - with wikidataid columns')
parser.add_argument('-input_lettercase',
                    default='uppercase',
                    help='variables in the shape file - lowercase or uppercase')
parser.add_argument('-output_csv_name',
                    default='ne_10m_populated_places.csv',
                    help='output csv file with wikidata labels')

args = parser.parse_args()

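# A typical invocation, using the defaults above (the file name
# fetch_wikidata.py is assumed here, not fixed by the code):
#   python3 fetch_wikidata.py \
#       -input_shape_name ../../10m_cultural/ne_10m_populated_places.shp \
#       -input_lettercase uppercase \
#       -output_csv_name ne_10m_populated_places.csv
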
def get_sparql_value(sresult, variable_id):
    """
    Return the raw value bound to variable_id in a SPARQL result row,
    or '' if the variable is unbound.
    """
    val = ''
    if variable_id in sresult:
        val = sresult[variable_id]['value']
    return val


def get_sparql_label(sresult, variable_id):
    """
    Return the label bound to variable_id, trimmed at the first
    '#', '(' or ',' and stripped of surrounding whitespace.
    """
    val = ''
    if variable_id in sresult:
        val = sresult[variable_id]['value'].split('#')[0].split('(')[0].split(',')[0]
    return val.strip()


def get_sparql_numvalue(sresult, variable_id):
    """
    Return the numeric value bound to variable_id, or -1 if unbound.
    """
    val = -1
    if variable_id in sresult:
        val = float(sresult[variable_id]['value'])
    return val

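# For example, a result row binding 'name_en' to 'Vienna (Austria), capital'
# yields get_sparql_label(row, 'name_en') == 'Vienna': everything after the
# first '#', '(' or ',' is discarded, then whitespace is stripped.
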

def post_process_wd_zh(properties):
    """
    First check whether name_zh (Simplified) and name_zht (Traditional)
    are already set; if not, use name_zh_default to backfill them.
    During the backfill, if there is no Simplified Chinese name the
    Traditional one is used as a further fallback, and vice versa.
    """
    if args.input_lettercase == "lowercase":
        name_en_default = properties.get('name_en', u'')
        zh_Hans_fallback = properties.get('name_zh_hans', u'')
        zh_Hant_fallback = properties.get('name_zh_hant', u'')
    else:
        name_en_default = properties.get('NAME_EN', u'')
        zh_Hans_fallback = properties.get('NAME_ZH_HANS', u'')
        zh_Hant_fallback = properties.get('NAME_ZH_HANT', u'')

    # sometimes the default Chinese name holds several '/'-separated values
    if 'name_zh_default' in properties:
        names = properties['name_zh_default'].split('/')
        for name in names:
            if hanzidentifier.is_simplified(name) and len(zh_Hans_fallback) == 0:
                zh_Hans_fallback = name
            if hanzidentifier.is_traditional(name) and len(zh_Hant_fallback) == 0:
                zh_Hant_fallback = name

    # make sure we don't shove English values into the Chinese namespace
    if (zh_Hans_fallback == name_en_default) and len(name_en_default) > 0:
        zh_Hans_fallback = u''
    if (zh_Hant_fallback == name_en_default) and len(name_en_default) > 0:
        zh_Hant_fallback = u''

    # now make the Simplified and Traditional Chinese name assignments
    if 'name_zhs' not in properties:
        if len(zh_Hans_fallback) != 0:
            properties['name_zhs'] = zh_Hans_fallback
        elif len(zh_Hant_fallback) != 0:
            properties['name_zhs'] = zh_Hant_fallback
        else:
            properties['name_zhs'] = u''

    if 'name_zht' not in properties:
        if len(zh_Hant_fallback) != 0:
            properties['name_zht'] = zh_Hant_fallback
        elif len(zh_Hans_fallback) != 0:
            properties['name_zht'] = zh_Hans_fallback
        else:
            properties['name_zht'] = u''

    # keep only the first option when a field is '/'-separated; for example
    # "旧金山市县/三藩市市縣/舊金山市郡" is reduced to "旧金山市县"
    for key in ('name_zh', 'name_zht', 'NAME_ZH', 'NAME_ZHT'):
        if key in properties and len(properties[key]) > 0:
            properties[key] = properties[key].split('/')[0].strip()

    return properties

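# Backfill example (hypothetical values): given name_zh_default = '旧金山/舊金山'
# and no explicit zh-hans/zh-hant labels, hanzidentifier classifies 旧金山 as
# Simplified and 舊金山 as Traditional, so name_zhs becomes '旧金山' and
# name_zht becomes '舊金山'.
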

def fetchwikidata(a_wid):
    """
    Fetch wikidata labels for a list of Q-ids with a single SPARQL query.
    """
    # 'agent' must be passed by keyword; the second positional parameter of
    # SPARQLWrapper is the update endpoint, not the user agent
    sparql = SPARQLWrapper(
        "https://query.wikidata.org/sparql",
        agent='natural_earth_name_localizer v1.1.1 (github.com/nvkelso/natural-earth-vector)')
query_template = """
|
|
SELECT
|
|
?e ?i ?r ?population
|
|
?name_ar
|
|
?name_bn
|
|
?name_de
|
|
?name_el
|
|
?name_en
|
|
?name_es
|
|
?name_fa
|
|
?name_fr
|
|
?name_he
|
|
?name_hi
|
|
?name_hu
|
|
?name_id
|
|
?name_it
|
|
?name_ja
|
|
?name_ko
|
|
?name_nl
|
|
?name_pl
|
|
?name_pt
|
|
?name_ru
|
|
?name_sv
|
|
?name_tr
|
|
?name_uk
|
|
?name_ur
|
|
?name_vi
|
|
?name_zh
|
|
?name_zh_hans
|
|
?name_zh_hant
|
|
WHERE {
|
|
{
|
|
SELECT DISTINCT ?e ?i ?r
|
|
WHERE{
|
|
VALUES ?i { wd:Q2102493 wd:Q1781 }
|
|
OPTIONAL{ ?i owl:sameAs ?r. }
|
|
BIND(COALESCE(?r, ?i) AS ?e).
|
|
}
|
|
}
|
|
SERVICE wikibase:label {bd:serviceParam wikibase:language "en".}
|
|
OPTIONAL{?e wdt:P1082 ?population .}
|
|
OPTIONAL{?e rdfs:label ?name_ar FILTER((LANG(?name_ar))="ar").}
|
|
OPTIONAL{?e rdfs:label ?name_bn FILTER((LANG(?name_bn))="bn").}
|
|
OPTIONAL{?e rdfs:label ?name_de FILTER((LANG(?name_de))="de").}
|
|
OPTIONAL{?e rdfs:label ?name_el FILTER((LANG(?name_el))="el").}
|
|
OPTIONAL{?e rdfs:label ?name_en FILTER((LANG(?name_en))="en").}
|
|
OPTIONAL{?e rdfs:label ?name_es FILTER((LANG(?name_es))="es").}
|
|
OPTIONAL{?e rdfs:label ?name_fa FILTER((LANG(?name_fa))="fa").}
|
|
OPTIONAL{?e rdfs:label ?name_fr FILTER((LANG(?name_fr))="fr").}
|
|
OPTIONAL{?e rdfs:label ?name_he FILTER((LANG(?name_he))="he").}
|
|
OPTIONAL{?e rdfs:label ?name_hi FILTER((LANG(?name_hi))="hi").}
|
|
OPTIONAL{?e rdfs:label ?name_hu FILTER((LANG(?name_hu))="hu").}
|
|
OPTIONAL{?e rdfs:label ?name_id FILTER((LANG(?name_id))="id").}
|
|
OPTIONAL{?e rdfs:label ?name_it FILTER((LANG(?name_it))="it").}
|
|
OPTIONAL{?e rdfs:label ?name_ja FILTER((LANG(?name_ja))="ja").}
|
|
OPTIONAL{?e rdfs:label ?name_ko FILTER((LANG(?name_ko))="ko").}
|
|
OPTIONAL{?e rdfs:label ?name_nl FILTER((LANG(?name_nl))="nl").}
|
|
OPTIONAL{?e rdfs:label ?name_pl FILTER((LANG(?name_pl))="pl").}
|
|
OPTIONAL{?e rdfs:label ?name_pt FILTER((LANG(?name_pt))="pt").}
|
|
OPTIONAL{?e rdfs:label ?name_ru FILTER((LANG(?name_ru))="ru").}
|
|
OPTIONAL{?e rdfs:label ?name_sv FILTER((LANG(?name_sv))="sv").}
|
|
OPTIONAL{?e rdfs:label ?name_tr FILTER((LANG(?name_tr))="tr").}
|
|
OPTIONAL{?e rdfs:label ?name_uk FILTER((LANG(?name_uk))="uk").}
|
|
OPTIONAL{?e rdfs:label ?name_ur FILTER((LANG(?name_ur))="ur").}
|
|
OPTIONAL{?e rdfs:label ?name_vi FILTER((LANG(?name_vi))="vi").}
|
|
OPTIONAL{?e rdfs:label ?name_zh FILTER((LANG(?name_zh))="zh").}
|
|
OPTIONAL{?e rdfs:label ?name_zh_hans FILTER((LANG(?name_zh_hans))="zh-hans").}
|
|
OPTIONAL{?e rdfs:label ?name_zh_hant FILTER((LANG(?name_zh_hant))="zh-hant").}
|
|
}
|
|
|
|
"""

    wikidata_sparql_ids = ""
    for wid in a_wid:
        wikidata_sparql_ids += " wd:" + wid

    print("fetch: ", wikidata_sparql_ids.split()[0], "... ", wikidata_sparql_ids.split()[-1])
    ne_query = query_template.replace('wd:Q2102493 wd:Q1781', wikidata_sparql_ids)

    # compress the query by collapsing runs of spaces to a single space
    while '  ' in ne_query:
        ne_query = ne_query.replace('  ', ' ')

    # retry up to 8 times, backing off on endpoint errors
    results = None
    retries = 0
    while results is None and retries < 8:
        try:
            results = None
            sparql.setQuery(ne_query)
            sparql.setTimeout(1000)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()

        except SPARQLExceptions.EndPointNotFound:
            print('ERRwikidata-SPARQLExceptions-EndPointNotFound: Retrying in 30 seconds.')
            time.sleep(30)
            retries += 1
            continue

        except SPARQLExceptions.EndPointInternalError as e:
            print("ERRwikidata-SPARQLExceptions-EndPointInternalError: Retrying in 30 seconds.", e)
            time.sleep(30)
            retries += 1
            continue

        except SPARQLExceptions.QueryBadFormed as e:
            # a malformed query will not get better on retry
            print("ERRwikidata-SPARQLExceptions-QueryBadFormed: Check the query! ", e)
            return "error"

        except TimeoutError as e:
            print("ERRwikidata-SPARQLExceptions TimeOut: Retrying in 1 second.", e)
            time.sleep(1)
            retries += 1
            continue

        except KeyboardInterrupt:
            sys.exit()

        except:
            wait = retries * 5
            print("ERRwikidata: other error. Retrying in " + str(wait) + " seconds.")
            print('error: %s ' % sys.exc_info()[0])
            time.sleep(wait)
            retries += 1
            continue

    if results is None and retries >= 8:
        print("Wikidata request failed; stopping!")
        sys.exit(1)

    return results

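# fetchwikidata() returns the parsed JSON response (a dict carrying
# results/bindings), or the string "error" for a malformed query; after
# eight failed attempts it exits the whole script.
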
print('- Start fetching Natural-Earth wikidata labels via SPARQL query -')

# newline='' prevents the csv module from writing blank lines on Windows
with open(args.output_csv_name, "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow((
        "wd_id",
        "wd_id_new",
        "population",
        #"elevation",
        "name_ar",
        "name_bn",
        "name_de",
        "name_el",
        "name_en",
        "name_es",
        "name_fa",
        "name_fr",
        "name_he",
        "name_hi",
        "name_hu",
        "name_id",
        "name_it",
        "name_ja",
        "name_ko",
        "name_nl",
        "name_pl",
        "name_pt",
        "name_ru",
        "name_sv",
        "name_tr",
        "name_uk",
        "name_ur",
        "name_vi",
        "name_zh",
        "name_zht",
    ))
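
    # the header above must stay aligned, column for column, with the
    # writerow() call at the bottom of the per-result loop below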
    # keep the shapefile loop inside the open() block so the csv writer
    # stays valid while rows are written
    with fiona.open(args.input_shape_name, 'r') as shape_input:
        i = 0
        REC_IN_SHAPE = len(shape_input)
        wikidata_chunk = list()
        for pt in shape_input:
            i = i + 1

            if args.input_lettercase == "lowercase":
                ne_wikidataid = pt['properties']['wikidataid']
            else:
                ne_wikidataid = pt['properties']['WIKIDATAID']

            ne_fid = pt['id']  # feature id (currently unused)

            if ne_wikidataid:
                if ne_wikidataid[0] == 'Q':
                    wikidata_chunk.append(ne_wikidataid)
                else:
                    print("ERROR: badly formatted wikidataid, skipping:", ne_wikidataid)

            # flush in batches of 200 ids, and once more at the end of the file
            if (len(wikidata_chunk) >= 200) or (i >= REC_IN_SHAPE):

                sparql_results = fetchwikidata(wikidata_chunk)
                wikidata_chunk = []

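                # entity values come back as full URIs, e.g.
                # http://www.wikidata.org/entity/Q64, so split('/')[4]
                # below recovers the bare QID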
                for result in sparql_results["results"]["bindings"]:
                    wd_id_label = get_sparql_value(result, 'e').split('/')[4]
                    wd_id = get_sparql_value(result, 'i').split('/')[4]
                    wd_id_new = get_sparql_value(result, 'r')
                    if wd_id_new:
                        wd_id_new = wd_id_new.split('/')[4]
                        print('Redirected:', wd_id, wd_id_new)
                    population = get_sparql_value(result, 'population')
                    #elevation = get_sparql_value(result, 'elevation')

                    name_ar = get_sparql_label(result, 'name_ar')
                    name_bn = get_sparql_label(result, 'name_bn')
                    name_de = get_sparql_label(result, 'name_de')
                    name_el = get_sparql_label(result, 'name_el')
                    name_en = get_sparql_label(result, 'name_en')
                    name_es = get_sparql_label(result, 'name_es')
                    name_fa = get_sparql_label(result, 'name_fa')
                    name_fr = get_sparql_label(result, 'name_fr')
                    name_he = get_sparql_label(result, 'name_he')
                    name_hi = get_sparql_label(result, 'name_hi')
                    name_hu = get_sparql_label(result, 'name_hu')
                    name_id = get_sparql_label(result, 'name_id')
                    name_it = get_sparql_label(result, 'name_it')
                    name_ja = get_sparql_label(result, 'name_ja')
                    name_ko = get_sparql_label(result, 'name_ko')
                    name_nl = get_sparql_label(result, 'name_nl')
                    name_pl = get_sparql_label(result, 'name_pl')
                    name_pt = get_sparql_label(result, 'name_pt')
                    name_ru = get_sparql_label(result, 'name_ru')
                    name_sv = get_sparql_label(result, 'name_sv')
                    name_tr = get_sparql_label(result, 'name_tr')
                    name_uk = get_sparql_label(result, 'name_uk')
                    name_ur = get_sparql_label(result, 'name_ur')
                    name_vi = get_sparql_label(result, 'name_vi')

                    # not all Wikidata places have every label translation;
                    # get_sparql_label() already returns '' for unbound
                    # variables, so no try/except is needed here
                    name_zh_default = get_sparql_label(result, 'name_zh')
                    name_zh_hans = get_sparql_label(result, 'name_zh_hans')
                    name_zh_hant = get_sparql_label(result, 'name_zh_hant')

                    chinese_names = {'name_en': name_en,
                                     'name_zh_default': name_zh_default,
                                     'name_zh_hans': name_zh_hans,
                                     'name_zh_hant': name_zh_hant}

                    processed_chinese_names = post_process_wd_zh(chinese_names)

                    name_zh = processed_chinese_names.get('name_zhs', u'')
                    name_zht = processed_chinese_names.get('name_zht', u'')

                    writer.writerow((
                        wd_id,
                        wd_id_new,
                        population,
                        #elevation,
                        name_ar,
                        name_bn,
                        name_de,
                        name_el,
                        name_en,
                        name_es,
                        name_fa,
                        name_fr,
                        name_he,
                        name_hi,
                        name_hu,
                        name_id,
                        name_it,
                        name_ja,
                        name_ko,
                        name_nl,
                        name_pl,
                        name_pt,
                        name_ru,
                        name_sv,
                        name_tr,
                        name_uk,
                        name_ur,
                        name_vi,
                        name_zh,
                        name_zht,
                    ))

print(' - JOB end -')