#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Dependencies:
#   pip3 install -U fiona
#   pip3 install -U csvtomd

"""
Write new wikidata labels:
read the latest wikidataid / name_XX values from a CSV file,
update the matching columns of a Natural Earth shapefile,
and write CSV audit logs about the changes.
"""

import argparse
import csv
import logging
import re
import sys
from collections import defaultdict

import fiona

logging.basicConfig(stream=sys.stderr, level=logging.INFO)

parser = argparse.ArgumentParser(description='Update wikidata labels for Natural-Earth')
parser.add_argument('-input_shape',
                    default='../../10m_cultural/ne_10m_populated_places.shp',
                    help='input natural-earth shape file - with wikidataid columns')
parser.add_argument('-input_lettercase',
                    default='lowercase',
                    help='variables in the shape file - lowercase or uppercase')
parser.add_argument('-input_csv',
                    default='ne_10m_populated_places.csv',
                    help='input csv file with the latest wikidataid name_XX columns')
parser.add_argument('-output_shape',
                    default='ne_10m_populated_places_latest.shp',
                    help='output natural-earth shape file - with updated wikidataid name_XX columns')
parser.add_argument('-output_csvlog',
                    default='ne_10m_populated_places_latest.log.csv',
                    help='audit log about the changes')
parser.add_argument('-output_csvsumlog',
                    default='ne_10m_populated_places_latest.sumlog.csv',
                    help='audit sum log about the changes')

args = parser.parse_args()
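
# Example invocation (the script name and file names here are illustrative;
# any shapefile with a wikidataid column and a CSV with matching wd_id /
# name_XX columns will do):
#   python3 update_wikidata_labels.py \
#       -input_shape ../../10m_cultural/ne_10m_populated_places.shp \
#       -input_csv ne_10m_populated_places.csv \
#       -output_shape ne_10m_populated_places_latest.shp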
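
# Name-cleaning patterns: strip generic words such as "River" or
# "Municipality of" from the labels so that only the proper name remains.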
#lakeclean_regex = re.compile(r'\b(' + 'Lake' + r')\b', flags=re.IGNORECASE)
riverclean_regex = re.compile(r'\b(' + 'River' + r')\b',
                              flags=re.IGNORECASE)
placeclean_regex = re.compile(r'\b(' + 'Municipality of|Municipality|First Nation' + r')\b',
                              flags=re.IGNORECASE)


def isNotEmpty(s):
    """
    Check that the string is not empty (None or whitespace-only counts as empty).
    """
    return bool(s and s.strip())

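
# Expected CSV layout: one row per feature with a 'wd_id' column (the wikidata
# Q-id), a 'wd_id_new' column holding the redirect target when the id was
# redirected, and one 'name_XX' column per language.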
# read csv data
wddic = defaultdict(dict)
wdredirects = {}
with open(args.input_csv, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        qid = row['wd_id']
        for d in row:
            if d != 'wd_id':
                #print(d, d[0:5])
                if d[0:5] == 'name_':
                    wddic[qid][d] = row[d].strip()

                    # Clean names ...

                    # Rivers ...
                    if args.input_shape.lower().find('river') > 0:
                        wddic[qid][d] = riverclean_regex.sub('', wddic[qid][d])

                    # Lakes ...
                    # if args.input_shape.lower().find('lake') > 0:
                    #     wddic[qid][d] = lakeclean_regex.sub('', wddic[qid][d])

                    # Places ...
                    if args.input_shape.lower().find('place') > 0:
                        wddic[qid][d] = placeclean_regex.sub('', wddic[qid][d])
                        # name_zh: drop a trailing "市" ("city") character
                        if d == 'name_zh' and wddic[qid]['name_zh'] and wddic[qid]['name_zh'][-1] == "市":
                            wddic[qid]['name_zh'] = wddic[qid]['name_zh'][:-1]

                    wddic[qid][d] = wddic[qid][d].strip()

                    if wddic[qid][d] != row[d].strip():
                        print(qid, d, ' name cleaning : ', row[d].strip(), ' --> ', wddic[qid][d])

                    # Add an uppercase version
                    if args.input_lettercase != "lowercase":
                        wddic[qid][d.upper()] = wddic[qid][d]  # add the uppercase variant
                        del wddic[qid][d]                      # remove the default lowercase key

        if row['wd_id_new'] != '':
            wdredirects[qid] = row['wd_id_new']

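# Apply the CSV labels to the shapefile: copy every feature from the source
# to the sink, updating its name_XX properties and logging each change.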
with open(args.output_csvlog, "w", encoding='utf-8') as logfile:
    writer = csv.writer(logfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(("wd_id", "status", "variable", "value_old", "value_new"))

    with fiona.open(args.input_shape, 'r', encoding='utf-8', driver="ESRI Shapefile") as source:
        # **source.meta is a shortcut to get the crs, driver, and schema
        # keyword arguments from the source Collection.
        with fiona.open(args.output_shape, 'w', encoding='utf-8', **source.meta) as sink:

            stat_equal = 0
            stat_empty = 0
            stat_new = 0
            stat_del = 0
            stat_mod = 0

            stat_wikidataid_redirect = 0
            stat_wikidataid_notfound = 0
            stat_wikidataid_null = 0
            stat_wikidataid_badformated = 0
            stat_wikidataid_notnull = 0
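
            # Features with an empty or malformed Q-id are counted but written
            # through unchanged.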
            for f in source:

                try:

                    if args.input_lettercase == "lowercase":
                        qid = f['properties']['wikidataid']
                    else:
                        qid = f['properties']['WIKIDATAID']

                    if not qid:
                        qid = ''

                    if qid.startswith('Q'):
                        stat_wikidataid_notnull += 1
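
                        # Only ids present in the CSV can be updated; anything
                        # else is reported below.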
                        if qid in wddic:
                            # Update wikidataid if it redirects
                            if qid in wdredirects:
                                writer.writerow((qid, "REDIRECT", 'wikidataid', qid, wdredirects[qid]))
                                stat_wikidataid_redirect += 1

                                if args.input_lettercase == "lowercase":
                                    f['properties']['wikidataid'] = wdredirects[qid]
                                else:
                                    f['properties']['WIKIDATAID'] = wdredirects[qid]
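
                            # Compare each name_XX value in the shapefile with
                            # the cleaned CSV value and classify the change.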
                            for updatefield in wddic[qid]:

                                # Guard first: a missing column would otherwise
                                # raise KeyError on the very next access.
                                if updatefield not in f['properties']:
                                    print("...ERROR.. updatefield is missing:", updatefield)
                                    sys.exit(1)

                                if not f['properties'][updatefield]:
                                    f['properties'][updatefield] = ''
                                else:
                                    f['properties'][updatefield] = f['properties'][updatefield].strip()

                                if not wddic[qid][updatefield]:
                                    wddic[qid][updatefield] = ''
                                else:
                                    wddic[qid][updatefield] = wddic[qid][updatefield].strip()

                                if f['properties'][updatefield] != wddic[qid][updatefield]:

                                    if f['properties'][updatefield] == '':
                                        status = 'NEWvalue'
                                        stat_new += 1
                                    elif wddic[qid][updatefield] == '':
                                        status = 'DELvalue'
                                        stat_del += 1
                                    else:
                                        status = 'MODvalue'
                                        stat_mod += 1

                                    writer.writerow((qid, status, updatefield,
                                                     f['properties'][updatefield],
                                                     wddic[qid][updatefield]))
                                    f['properties'][updatefield] = wddic[qid][updatefield]

                                else:
                                    if isNotEmpty(f['properties'][updatefield]):
                                        stat_equal += 1
                                    else:
                                        stat_empty += 1

                        else:
                            print("WARNING: wikidataid not found - please check:", qid)
                            stat_wikidataid_notfound += 1

                    else:

                        if qid == '':
                            stat_wikidataid_null += 1
                        else:
                            stat_wikidataid_badformated += 1
                            print("WARNING: badly formatted wikidataid, skipping:", qid)

                    sink.write(f)

                except Exception:
                    logging.exception("Error processing feature %s:", f['id'])
                    sys.exit(1)

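# The CSV logs are plain tables; the csvtomd dependency mentioned in the
# header can render them as markdown, e.g.:
#   csvtomd ne_10m_populated_places_latest.sumlog.csv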
# Write statistics ...
with open(args.output_csvsumlog, "w", encoding='utf-8') as s:
    sumWriter = csv.writer(s, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    sumWriter.writerow(("shapefilename", "var", "value"))

    sumWriter.writerow((args.input_shape, 'New_name', stat_new))
    sumWriter.writerow((args.input_shape, 'Deleted_name', stat_del))
    sumWriter.writerow((args.input_shape, 'Modified_name', stat_mod))
    sumWriter.writerow((args.input_shape, 'Empty_name', stat_empty))
    sumWriter.writerow((args.input_shape, 'Same_name', stat_equal))
    sumWriter.writerow((args.input_shape, 'Wikidataid_redirected', stat_wikidataid_redirect))
    sumWriter.writerow((args.input_shape, 'Wikidataid_notfound', stat_wikidataid_notfound))
    sumWriter.writerow((args.input_shape, 'Wikidataid_null', stat_wikidataid_null))
    sumWriter.writerow((args.input_shape, 'Wikidataid_notnull', stat_wikidataid_notnull))
    sumWriter.writerow((args.input_shape, 'Wikidataid_badformated', stat_wikidataid_badformated))