wikidata: documentation, some minor changes

This commit is contained in:
ImreSamu 2018-05-20 20:31:27 +02:00
parent 09fc9316aa
commit 564f006bf8
4 changed files with 390 additions and 151 deletions

View File

@ -9,7 +9,7 @@ matrix:
include:
- os: osx
language: generic
env:
env:
- PY=3.6
- nedocker=NO
before_install:
@ -40,10 +40,10 @@ matrix:
# - os: linux
# env:
# env:
# - PY=3.6
# - nedocker=NO
# sudo: required
# sudo: required
# before_install:
# - sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable --yes
# - sudo apt-get --yes --force-yes update -qq
@ -71,15 +71,15 @@ matrix:
# python: 3.6
# os: linux
# env:
# - nedocker=NO
# sudo: required
# - nedocker=NO
# sudo: required
# before_install:
# - sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable --yes
# - sudo apt-get --yes --force-yes update -qq
# - sudo apt-get install --yes gdal-bin jq
# - pip3 install -U SPARQLWrapper
# - pip3 install -U fiona
# - python -c "import fiona"
# - python -c "import fiona"
# - pip3 install -U csvtomd
# - pip3 install -U requests
# script:
@ -91,15 +91,15 @@ matrix:
# python: 3.5
# os: linux
# env:
# - nedocker=NO
# sudo: required
# - nedocker=NO
# sudo: required
# before_install:
# - sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable --yes
# - sudo apt-get --yes --force-yes update -qq
# - sudo apt-get install --yes gdal-bin jq
# - pip3 install -U SPARQLWrapper
# - pip3 install -U fiona
# - python -c "import fiona"
# - python -c "import fiona"
# - pip3 install -U csvtomd
# - pip3 install -U requests
# script:
@ -109,11 +109,11 @@ matrix:
- services: docker
os: linux
env:
- nedocker=YES
sudo: required
- nedocker=YES
sudo: required
before_install:
- docker version
- docker build -t ne_py3wikidata .
- docker images
script:
script:
- docker run -it -v $(pwd):/ne ne_py3wikidata bash run_all.sh

View File

@ -1,7 +1,6 @@
#!/bin/bash
set -Eeuo pipefail
STARTDATE=$(date +"%Y-%m-%dT%H:%M%z")
# clean and recreate x_tempshape directory
@ -12,12 +11,13 @@ log_file=x_tempshape/run_all.log
exec &> >(tee -a "$log_file")
# Don't forget to update the VERSION file!
cat VERSION
echo "-----------------------------------"
echo "Version $(cat VERSION)"
echo "Start: $STARTDATE "
# Show some debug info
python3 ./tools/wikidata/platform_debug_info.py
# Summary Log file
logmd=x_tempshape/update.md
rm -f $logmd
@ -27,59 +27,59 @@ rm -f $logmd
# LetterCase = uppercase --> variable names [WIKIDATAID, NAME_AR, NAME_BN, NAME_DE, NAME_EN, NAME_ES, ... ]
# LetterCase = lowercase --> variable names [wikidataid, name_ar, name_bn, name_de, name_en, name_es, ... ]
# --------------------------------------------------------------------------------------------------------------------
# |mode |LetterCase| shape_path | shape filename
# == 10m ================= |==== |==========| ============| ================================================
./tools/wikidata/update.sh all uppercase 10m_cultural ne_10m_admin_0_countries_lakes
./tools/wikidata/update.sh all uppercase 10m_cultural ne_10m_admin_0_countries
./tools/wikidata/update.sh all uppercase 10m_cultural ne_10m_admin_0_disputed_areas
./tools/wikidata/update.sh all uppercase 10m_cultural ne_10m_admin_0_map_subunits
./tools/wikidata/update.sh all uppercase 10m_cultural ne_10m_admin_0_map_units
./tools/wikidata/update.sh all uppercase 10m_cultural ne_10m_admin_0_sovereignty
./tools/wikidata/update.sh all lowercase 10m_cultural ne_10m_admin_1_states_provinces_lakes
./tools/wikidata/update.sh all lowercase 10m_cultural ne_10m_admin_1_states_provinces
./tools/wikidata/update.sh all lowercase 10m_cultural ne_10m_airports
./tools/wikidata/update.sh all lowercase 10m_cultural ne_10m_populated_places
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_geographic_lines
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_geography_marine_polys
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_geography_regions_elevation_points
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_geography_regions_points
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_geography_regions_polys
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_lakes_europe
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_lakes_historic
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_lakes_north_america
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_lakes
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_playas
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_rivers_europe
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_rivers_lake_centerlines_scale_rank
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_rivers_lake_centerlines
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_rivers_north_america
./tools/wikidata/update.sh all lowercase 10m_cultural ne_10m_admin_1_label_points_details
# == 50m ================= |==== |==========| ============| ================================================
./tools/wikidata/update.sh all uppercase 50m_cultural ne_50m_admin_0_sovereignty
./tools/wikidata/update.sh all uppercase 50m_cultural ne_50m_admin_0_countries
./tools/wikidata/update.sh all uppercase 50m_cultural ne_50m_admin_0_countries_lakes
./tools/wikidata/update.sh all uppercase 50m_cultural ne_50m_admin_0_map_units
./tools/wikidata/update.sh all uppercase 50m_cultural ne_50m_admin_0_map_subunits
./tools/wikidata/update.sh all uppercase 50m_cultural ne_50m_admin_0_tiny_countries
#./tools/wikidata/update.sh all uppercase 50m_cultural ne_50m_admin_0_breakaway_disputed_areas # KeyError: 'WIKIDATAID'
#./tools/wikidata/update.sh all uppercase 50m_cultural ne_50m_admin_0_breakaway_disputed_areas_scale_rank # KeyError: 'WIKIDATAID'
./tools/wikidata/update.sh all lowercase 50m_cultural ne_50m_admin_1_states_provinces
./tools/wikidata/update.sh all lowercase 50m_cultural ne_50m_admin_1_states_provinces_lakes
./tools/wikidata/update.sh all lowercase 50m_physical ne_50m_lakes
./tools/wikidata/update.sh all lowercase 50m_physical ne_50m_lakes_historic
./tools/wikidata/update.sh all lowercase 50m_physical ne_50m_playas
./tools/wikidata/update.sh all lowercase 50m_physical ne_50m_rivers_lake_centerlines
./tools/wikidata/update.sh all lowercase 50m_physical ne_50m_rivers_lake_centerlines_scale_rank
# ==110m ================= |==== |==========| ============| ================================================
./tools/wikidata/update.sh all uppercase 110m_cultural ne_110m_admin_0_sovereignty
./tools/wikidata/update.sh all uppercase 110m_cultural ne_110m_admin_0_countries
./tools/wikidata/update.sh all uppercase 110m_cultural ne_110m_admin_0_countries_lakes
./tools/wikidata/update.sh all uppercase 110m_cultural ne_110m_admin_0_map_units
./tools/wikidata/update.sh all lowercase 110m_cultural ne_110m_admin_1_states_provinces
./tools/wikidata/update.sh all lowercase 110m_cultural ne_110m_admin_1_states_provinces_lakes
./tools/wikidata/update.sh all lowercase 110m_physical ne_110m_lakes
./tools/wikidata/update.sh all lowercase 110m_physical ne_110m_rivers_lake_centerlines
# ======================== |==== |==========| ============| ================================================
# | mode |LetterCase| shape_path | shape filename
# == 10m ================= |=========== |==========| ============| ================================================
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries_lakes
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_countries
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_disputed_areas
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_map_subunits
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_map_units
./tools/wikidata/update.sh fetch_write uppercase 10m_cultural ne_10m_admin_0_sovereignty
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces_lakes
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_states_provinces
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_airports
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_populated_places
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_geographic_lines
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_geography_marine_polys
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_geography_regions_elevation_points
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_geography_regions_points
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_geography_regions_polys
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_lakes_europe
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_lakes_historic
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_lakes_north_america
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_lakes
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_playas
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_europe
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_lake_centerlines_scale_rank
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_lake_centerlines
./tools/wikidata/update.sh fetch_write lowercase 10m_physical ne_10m_rivers_north_america
./tools/wikidata/update.sh fetch_write lowercase 10m_cultural ne_10m_admin_1_label_points_details
# == 50m ================= |=========== |==========| ============| ================================================
./tools/wikidata/update.sh fetch_write uppercase 50m_cultural ne_50m_admin_0_sovereignty
./tools/wikidata/update.sh fetch_write uppercase 50m_cultural ne_50m_admin_0_countries
./tools/wikidata/update.sh fetch_write uppercase 50m_cultural ne_50m_admin_0_countries_lakes
./tools/wikidata/update.sh fetch_write uppercase 50m_cultural ne_50m_admin_0_map_units
./tools/wikidata/update.sh fetch_write uppercase 50m_cultural ne_50m_admin_0_map_subunits
./tools/wikidata/update.sh fetch_write uppercase 50m_cultural ne_50m_admin_0_tiny_countries
#./tools/wikidata/update.sh fetch_write uppercase 50m_cultural ne_50m_admin_0_breakaway_disputed_areas # KeyError: 'WIKIDATAID'
#./tools/wikidata/update.sh fetch_write uppercase 50m_cultural ne_50m_admin_0_breakaway_disputed_areas_scale_rank # KeyError: 'WIKIDATAID'
./tools/wikidata/update.sh fetch_write lowercase 50m_cultural ne_50m_admin_1_states_provinces
./tools/wikidata/update.sh fetch_write lowercase 50m_cultural ne_50m_admin_1_states_provinces_lakes
./tools/wikidata/update.sh fetch_write lowercase 50m_physical ne_50m_lakes
./tools/wikidata/update.sh fetch_write lowercase 50m_physical ne_50m_lakes_historic
./tools/wikidata/update.sh fetch_write lowercase 50m_physical ne_50m_playas
./tools/wikidata/update.sh fetch_write lowercase 50m_physical ne_50m_rivers_lake_centerlines
./tools/wikidata/update.sh fetch_write lowercase 50m_physical ne_50m_rivers_lake_centerlines_scale_rank
# ==110m ================= |=========== |==========| ============| ================================================
./tools/wikidata/update.sh fetch_write uppercase 110m_cultural ne_110m_admin_0_sovereignty
./tools/wikidata/update.sh fetch_write uppercase 110m_cultural ne_110m_admin_0_countries
./tools/wikidata/update.sh fetch_write uppercase 110m_cultural ne_110m_admin_0_countries_lakes
./tools/wikidata/update.sh fetch_write uppercase 110m_cultural ne_110m_admin_0_map_units
./tools/wikidata/update.sh fetch_write lowercase 110m_cultural ne_110m_admin_1_states_provinces
./tools/wikidata/update.sh fetch_write lowercase 110m_cultural ne_110m_admin_1_states_provinces_lakes
./tools/wikidata/update.sh fetch_write lowercase 110m_physical ne_110m_lakes
./tools/wikidata/update.sh fetch_write lowercase 110m_physical ne_110m_rivers_lake_centerlines
# ======================== |=========== |==========| ============| ================================================
# show summary
cat x_tempshape/update.md
@ -87,8 +87,30 @@ cat x_tempshape/update.md
# list new files
ls -Gga x_tempshape/*/*
# Update shape files ( if everything is OK! )
cp -r x_tempshape/10m_cultural/* 10m_cultural/
cp -r x_tempshape/10m_physical/* 10m_physical/
cp -r x_tempshape/50m_cultural/* 50m_cultural/
cp -r x_tempshape/50m_physical/* 50m_physical/
cp -r x_tempshape/110m_cultural/* 110m_cultural/
cp -r x_tempshape/110m_physical/* 110m_physical/
# test copy mode ( write again .. )
./tools/wikidata/update.sh copy uppercase 10m_cultural ne_10m_admin_0_countries
# Run the final update process
make clean all
echo " "
echo " ---------------------"
# Footer: report when the run finished and where the full log was written.
# Capture the finish time in the same format used for STARTDATE.
STOPDATE=$(date +"%Y-%m-%dT%H:%M%z")
# Print the finish time; this line previously echoed $STARTDATE, so the
# "Stop:" entry in the log repeated the start time.
echo "Stop: $STOPDATE "
echo " see log file: "
ls -Gga $log_file
echo " "
echo " ---- end of run_all.sh ------ "
ls -Gga $log_file

View File

@ -9,7 +9,7 @@
# run from the project root (expect 30-40 minutes)
# be careful: this runs 'make all'
./run_all.sh
./run_all.sh
# Check the log file
cat x_tempshape/run_all.log
@ -34,7 +34,7 @@ x_tempshape/update.md
./tools/wikidata/update.sh all lowercase 10m_physical ne_10m_lakes_north_america
```
mode =
mode =
* fetch = fetch Wikidata Labels(names) via SPARQL - and create a csv file
* write = create a new temp Shape file with the new wikidata names
* fetch_write = fetch and write
@ -57,33 +57,171 @@ step by step
```
# ./tools/wikidata/update.sh fetch ...
`./tools/wikidata/update.sh fetch lowercase 10m_physical ne_10m_lakes_north_america`
* list input shape file variables
* query all wikidata labels
* write output : `x_tempshape/10m_physical/ne_10m_lakes_north_america.new_names.csv`
Example log:
```log
$ ./tools/wikidata/update.sh fetch lowercase 10m_physical ne_10m_lakes_north_america
########## /tools/wikidata/update.sh parameters:
1: mode : fetch
2: nei_letter_case: lowercase
3: neo_path : x_tempshape
4: ne_shapepath : 10m_physical
5: ne_shapefile : ne_10m_lakes_north_america
Fetch wikidata labels
=================================
INFO: Open of `./10m_physical/ne_10m_lakes_north_america.shp'
using driver `ESRI Shapefile' successful.
Layer name: ne_10m_lakes_north_america
Geometry: Polygon
Feature Count: 1200
Extent: (-164.284110, 8.988349) - (-18.569997, 82.292487)
Layer SRS WKT:
GEOGCS["GCS_WGS_1984",
DATUM["WGS_1984",
SPHEROID["WGS_84",6378137,298.257223563]],
PRIMEM["Greenwich",0],
UNIT["Degree",0.017453292519943295],
AUTHORITY["EPSG","4326"]]
uident: Real (25.9)
featurecla: String (50.0)
name: String (100.0)
name_alt: String (100.0)
note: String (100.0)
scalerank: Integer64 (10.0)
min_zoom: Real (6.1)
min_label: Real (4.1)
label: String (254.0)
wikidataid: String (254.0)
name_ar: String (254.0)
name_bn: String (254.0)
name_de: String (254.0)
name_en: String (254.0)
name_es: String (254.0)
name_fr: String (254.0)
name_el: String (254.0)
name_hi: String (254.0)
name_hu: String (254.0)
name_id: String (254.0)
name_it: String (254.0)
name_ja: String (254.0)
name_ko: String (254.0)
name_nl: String (254.0)
name_pl: String (254.0)
name_pt: String (254.0)
name_ru: String (254.0)
name_sv: String (254.0)
name_tr: String (254.0)
name_vi: String (254.0)
name_zh: String (254.0)
wdid_score: Integer (1.0)
ne_id: Integer64 (10.0)
- Start fetching Natural-Earth wikidata labels via SPARQL query -
fetch: wd:Q6474657 ... wd:Q5594723
fetch: wd:Q5034223 ... wd:Q4208879
Redirected: Q22702352 Q1799606
fetch: wd:Q3114698 ... wd:Q595625
- JOB end -
created : x_tempshape/10m_physical/ne_10m_lakes_north_america.new_names.csv
```
### /temp_shape/10m_physical/ne_10m_lakes_north_america.changes_log.csv # Column changes - csv format
#### x_tempshape/10m_physical/ne_10m_lakes_north_america.new_names.csv
```
"wd_id","status","variable","value_old","value_new"
"Q1323525","NEWvalue","name_ko","","워싱턴 호"
"Q7356585","MODvalue","name_fr","William","William 'Bill' Dannelly Reservoir"
"Q15118728","NEWvalue","name_en","","Little Salmon Lake"
"Q7236081","NEWvalue","name_de","","Powell Lake"
"Q7236081","NEWvalue","name_es","","Powell Lake"
"Q7236081","NEWvalue","name_it","","Powell Lake"
"Q7236081","NEWvalue","name_nl","","Powell Lake"
"Q22702352","REDIRECT","wikidataid","Q22702352","Q1799606"
"Q22702352","MODvalue","name_de","lac Pusticamica","Lac Pusticamica"
"Q1800890","MODvalue","name_en","Lake Chemong","Chemong Lake"
"Q1800890","NEWvalue","name_sv","","Chemong Lake"
```csv
"wd_id","wd_id_new","population","name_ar","name_bn","name_de","name_en","name_es","name_fr","name_el","name_hi","name_hu","name_id","name_it","name_ja","name_ko","name_nl","name_pl","name_pt","name_ru","name_sv","name_tr","name_vi","name_zh"
"Q4397897","","","","","","Ross Barnett Reservoir","","","","","","","","","","","","","Росс Барнетт","","","",""
"Q1426999","","","","","Theodore Roosevelt Lake","Theodore Roosevelt Lake","","","","","","","","","","","","","Рузвельт","","","",""
"Q175554","","","","","Walker Lake","Walker Lake","","Walker Lake","","","Walker-tó","","","ウォーカー湖","","Walker Lake","","","Уокер","","","",""
"Q6908686","","","","","","Mooselookmeguntic Lake","","Mooselookmeguntic Lake","","","","","","","","","","","Муслукмегантик","","","",""
"Q1110527","","","","","Priest Lake","Priest Lake","","Priest Lake","","","","","","","","","","","Прист","","","",""
"Q1627906","","","","","","Caddo Lake","","lac Caddo","","","","","lago Caddo","","","Caddo Lake","","Lago Caddo","Каддо","","","",""
"Q4261031","","","","","","Lake Livingston","","lac Livingston","","","","","","","","","","","Ливингстон","","","",""
"Q4231229","","","","","","Lake Conroe","","Lake Conroe","","","","","","","","","","","Конро","","","",""
"Q2365354","","","","","Summer Lake","Summer Lake","","Summer Lake","","","","","","","","","","","Саммер","","","",""
...
```
### ./temp_shape/10m_physical/ne_10m_lakes_north_america.changes_log.csv.md # Column changes - markdown
# ./tools/wikidata/update.sh write ...
` ./tools/wikidata/update.sh write lowercase 10m_physical ne_10m_lakes_north_america`
* create new temp shapefile
* create some audit logs and statistics
```log
$ ./tools/wikidata/update.sh write lowercase 10m_physical ne_10m_lakes_north_america
########## /tools/wikidata/update.sh parameters:
1: mode : write
2: nei_letter_case: lowercase
3: neo_path : x_tempshape
4: ne_shapepath : 10m_physical
5: ne_shapefile : ne_10m_lakes_north_america
Write shapefile with wikidata labels
=================================
shapefile info : x_tempshape/10m_physical/ne_10m_lakes_north_america
name_en/NAME_EN changes x_tempshape/10m_physical/ne_10m_lakes_north_america)
---------------------
Q1800890 | MODvalue | name_en | Lake Chemong | Chemong Lake
shapefilename | var | value
-----------------------------------------------|--------------------------|-------
./10m_physical/ne_10m_lakes_north_america.shp | New_name | 12
./10m_physical/ne_10m_lakes_north_america.shp | Deleted_name | 0
./10m_physical/ne_10m_lakes_north_america.shp | Modified_name | 3
./10m_physical/ne_10m_lakes_north_america.shp | Empty_name | 7894
./10m_physical/ne_10m_lakes_north_america.shp | Same_name | 1604
./10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_redirected | 1
./10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_notfound | 0
./10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_null | 747
./10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_notnull | 453
./10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_badformated | 0
(write) created :
-------------------
-rw-r--r-- 1 942 May 20 19:27 x_tempshape/10m_physical/ne_10m_lakes_north_america.changes_log.csv
-rw-r--r-- 1 1393 May 20 19:27 x_tempshape/10m_physical/ne_10m_lakes_north_america.changes_log.csv.md
-rw-r--r-- 1 5 May 20 19:27 x_tempshape/10m_physical/ne_10m_lakes_north_america.cpg
-rw-r--r-- 1 7499890 May 20 19:27 x_tempshape/10m_physical/ne_10m_lakes_north_america.dbf
-rw-r--r-- 1 57604 May 20 19:23 x_tempshape/10m_physical/ne_10m_lakes_north_america.new_names.csv
-rw-r--r-- 1 143 May 20 19:27 x_tempshape/10m_physical/ne_10m_lakes_north_america.prj
-rw-r--r-- 1 573424 May 20 19:27 x_tempshape/10m_physical/ne_10m_lakes_north_america.shp
-rw-r--r-- 1 9700 May 20 19:27 x_tempshape/10m_physical/ne_10m_lakes_north_america.shx
-rw-r--r-- 1 749 May 20 19:27 x_tempshape/10m_physical/ne_10m_lakes_north_america.summary_log.csv
-rw-r--r-- 1 967 May 20 19:27 x_tempshape/10m_physical/ne_10m_lakes_north_america.summary_log.csv.md
```
#### write - audit log: x_tempshape/10m_physical/ne_10m_lakes_north_america.changes_log.csv.md
```
$ cat x_tempshape/10m_physical/ne_10m_lakes_north_america.changes_log.csv.md
wd_id | status | variable | value_old | value_new
-----------|------------|--------------|-------------------|-----------------------------------
Q1323525 | NEWvalue | name_ko | | 워싱턴 호
Q1323525 | NEWvalue | name_pl | | Washington
Q1495651 | NEWvalue | name_sv | | Lake George
Q1627906 | NEWvalue | name_pt | | Lago Caddo
Q7356585 | MODvalue | name_fr | William | William 'Bill' Dannelly Reservoir
Q13700 | NEWvalue | name_tr | | Texcoco Gölü
Q15118728 | NEWvalue | name_en | | Little Salmon Lake
Q16931868 | NEWvalue | name_sv | | Athapapuskow Lake
Q7236081 | NEWvalue | name_de | | Powell Lake
Q7236081 | NEWvalue | name_es | | Powell Lake
Q7236081 | NEWvalue | name_it | | Powell Lake
@ -92,65 +230,76 @@ Q22702352 | REDIRECT | wikidataid | Q22702352 | Q1799606
Q22702352 | MODvalue | name_de | lac Pusticamica | Lac Pusticamica
Q1800890 | MODvalue | name_en | Lake Chemong | Chemong Lake
Q1800890 | NEWvalue | name_sv | | Chemong Lake
### ./temp_shape/10m_physical/ne_10m_lakes_north_america.new_names.csv # input csv
```bash
$ cat ./temp_shape/10m_physical/ne_10m_lakes_north_america.new_names.csv | head
"wd_id","wd_id_new","population","name_ar","name_bn","name_de","name_en","name_es","name_fr","name_el","name_hi","name_hu","name_id","name_it","name_ja","name_ko","name_nl","name_pl","name_pt","name_ru","name_sv","name_tr","name_vi","name_zh"
"Q1426999","","","","","Theodore Roosevelt Lake","Theodore Roosevelt Lake","","","","","","","","","","","","","Рузвельт","","","",""
"Q4397897","","","","","","Ross Barnett Reservoir","","","","","","","","","","","","","Росс Барнетт","","","",""
"Q175554","","","","","Walker Lake","Walker Lake","","Walker Lake","","","Walker-tó","","","ウォーカー湖","","Walker Lake","","","Уокер","","","",""
"Q6908686","","","","","","Mooselookmeguntic Lake","","Mooselookmeguntic Lake","","","","","","","","","","","Муслукмегантик","","","",""
"Q1110527","","","","","Priest Lake","Priest Lake","","Priest Lake","","","","","","","","","","","Прист","","","",""
"Q1627906","","","","","","Caddo Lake","","lac Caddo","","","","","lago Caddo","","","Caddo Lake","","","Каддо","","","",""
"Q4261031","","","","","","Lake Livingston","","lac Livingston","","","","","","","","","","","Ливингстон","","","",""
"Q4231229","","","","","","Lake Conroe","","Lake Conroe","","","","","","","","","","","Конро","","","",""
"Q2365354","","","","","Summer Lake","Summer Lake","","Summer Lake","","","","","","","","","","","Саммер","","","",""
...
```
### ./temp_shape/10m_physical/ne_10m_lakes_north_america.summary_log.csv # Summary of the changes - csv
### write - summary audit log: x_tempshape/10m_physical/ne_10m_lakes_north_america.summary_log.csv.md
```
$ cat x_tempshape/10m_physical/ne_10m_lakes_north_america.summary_log.csv.md
shapefilename | var | value
-----------------------------------------------|--------------------------|-------
./10m_physical/ne_10m_lakes_north_america.shp | New_name | 12
./10m_physical/ne_10m_lakes_north_america.shp | Deleted_name | 0
./10m_physical/ne_10m_lakes_north_america.shp | Modified_name | 3
./10m_physical/ne_10m_lakes_north_america.shp | Empty_name | 7894
./10m_physical/ne_10m_lakes_north_america.shp | Same_name | 1604
./10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_redirected | 1
./10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_notfound | 0
./10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_null | 747
./10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_notnull | 453
./10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_badformated | 0
```csv
"shapefilename","var","value"
"10m_physical/ne_10m_lakes_north_america.shp","New_name","7"
"10m_physical/ne_10m_lakes_north_america.shp","Deleted_name","0"
"10m_physical/ne_10m_lakes_north_america.shp","Modified_name","3"
"10m_physical/ne_10m_lakes_north_america.shp","Empty_name ","7899"
"10m_physical/ne_10m_lakes_north_america.shp","Same_name","1604"
"10m_physical/ne_10m_lakes_north_america.shp","Wikidataid_redirected","1"
"10m_physical/ne_10m_lakes_north_america.shp","Wikidataid_notfound","0"
"10m_physical/ne_10m_lakes_north_america.shp","Wikidataid_null","747"
"10m_physical/ne_10m_lakes_north_america.shp","Wikidataid_notnull","453"
"10m_physical/ne_10m_lakes_north_america.shp","Wikidataid_badformated","0"
```
### ./temp_shape/10m_physical/ne_10m_lakes_north_america.shp.summary_log.csv.md # Summary of the changes - markdown
# ./tools/wikidata/update.sh copy ...
Be careful with copy mode!
`./tools/wikidata/update.sh copy lowercase 10m_physical ne_10m_lakes_north_america`
* copy the new files (shape + audit) to the original place
```
$ ./tools/wikidata/update.sh copy lowercase 10m_physical ne_10m_lakes_north_america
########## /tools/wikidata/update.sh parameters:
1: mode : copy
2: nei_letter_case: lowercase
3: neo_path : x_tempshape
4: ne_shapepath : 10m_physical
5: ne_shapefile : ne_10m_lakes_north_america
Copy shape + audit files
===============================
'x_tempshape/10m_physical/ne_10m_lakes_north_america.changes_log.csv' -> '10m_physical/ne_10m_lakes_north_america.changes_log.csv'
'x_tempshape/10m_physical/ne_10m_lakes_north_america.changes_log.csv.md' -> '10m_physical/ne_10m_lakes_north_america.changes_log.csv.md'
'x_tempshape/10m_physical/ne_10m_lakes_north_america.cpg' -> '10m_physical/ne_10m_lakes_north_america.cpg'
'x_tempshape/10m_physical/ne_10m_lakes_north_america.dbf' -> '10m_physical/ne_10m_lakes_north_america.dbf'
'x_tempshape/10m_physical/ne_10m_lakes_north_america.new_names.csv' -> '10m_physical/ne_10m_lakes_north_america.new_names.csv'
'x_tempshape/10m_physical/ne_10m_lakes_north_america.prj' -> '10m_physical/ne_10m_lakes_north_america.prj'
'x_tempshape/10m_physical/ne_10m_lakes_north_america.shp' -> '10m_physical/ne_10m_lakes_north_america.shp'
'x_tempshape/10m_physical/ne_10m_lakes_north_america.shx' -> '10m_physical/ne_10m_lakes_north_america.shx'
'x_tempshape/10m_physical/ne_10m_lakes_north_america.summary_log.csv' -> '10m_physical/ne_10m_lakes_north_america.summary_log.csv'
'x_tempshape/10m_physical/ne_10m_lakes_north_america.summary_log.csv.md' -> '10m_physical/ne_10m_lakes_north_america.summary_log.csv.md'
```
shapefilename | var | value
---------------------------------------------|--------------------------|-------
10m_physical/ne_10m_lakes_north_america.shp | New_name | 7
10m_physical/ne_10m_lakes_north_america.shp | Deleted_name | 0
10m_physical/ne_10m_lakes_north_america.shp | Modified_name | 3
10m_physical/ne_10m_lakes_north_america.shp | Empty_name | 7899
10m_physical/ne_10m_lakes_north_america.shp | Same_name | 1604
10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_redirected | 1
10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_notfound | 0
10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_null | 747
10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_notnull | 453
10m_physical/ne_10m_lakes_north_america.shp | Wikidataid_badformated | 0
# My best practice ...
* Run step by step ( line by line) : `./run_all.sh` in `fetch_write` mode
* check out the original shape files
* Run step by step ( line by line , table by table ) from the `./run_all.sh` in `fetch_write` mode
* check the audit csv files (open them with LibreOffice and use filters)
* find & fix the 'fake' wikidata changes :(
* iterate or modify input csv and write shape files
* check shape files and move the shape files to the correct folders
* check shape files
* if OK - move the shape files to the correct folders: `./tools/wikidata/update.sh copy ...`
# known problems
@ -164,8 +313,12 @@ WARNING:Fiona:CPLE_AppDefined in b'Value -3.34011000000000013 of field longitude
...
```
### uppercase / lowercase variable names
lettercase = uppercase variable names [WIKIDATAID, NAME_AR, NAME_BN, NAME_DE, NAME_EN, NAME_ES, ... ]
* 10m_cultural/ne_10m_admin_0_countries_lakes.shp
* 10m_cultural/ne_10m_admin_0_countries.shp
@ -199,7 +352,7 @@ lettercase = lowercase variable names [wikidataid, name_ar, name_bn, name_de, na
* 50m_cultural/...
* 50m_physical/...
* 110m_cultural/...
* 110m_physical/...
* 110m_physical/...
see the _latest_ information in the `./run_all.sh`
@ -230,3 +383,67 @@ variable name | language | language wikipedia link
NAME_VI | Vietnamese | https://en.wikipedia.org/wiki/Vietnamese_language
NAME_ZH | Chinese | https://en.wikipedia.org/wiki/Chinese_language
# Name cleaning
A minimal regexp implementation, hard-coded in `write_wikidata.py`.
TODO: a better implementation is needed.
### remove `river`
If the shape file name contains the trigger word ('river'), the regexp is applied.
```python
riverclean_regex = re.compile(r'\b('+'River'+r')\b', flags=re.IGNORECASE)
....
if args.input_shape.lower().find('river') > 0:
wddic[qid][d] = riverclean_regex.sub('', wddic[qid][d])
....
```
The changes are written to the log.
```
Q1330818 name_en name cleaning : Pite River --> Pite
Q16663 name_en name cleaning : Alagón River --> Alagón
Q14764 name_en name cleaning : Esla river --> Esla
Q14755 name_en name cleaning : Tormes River --> Tormes
Q71122 name_en name cleaning : Chir River --> Chir
Q192157 name_en name cleaning : Belaya River --> Belaya
Q202796 name_en name cleaning : Desna River --> Desna
```
### remove `Municipality of|Municipality|First Nation` words
If the shape file name contains the trigger word ('place'), the regexp is applied.
```python
placeclean_regex = re.compile(r'\b('+'Municipality of|Municipality|First Nation'+r')\b',
flags=re.IGNORECASE)
...
# Places ...
if args.input_shape.lower().find('place') > 0:
wddic[qid][d] = placeclean_regex.sub('', wddic[qid][d])
... remove 市(city)
```
example changes:
```
Q3078079 name_en name cleaning : Fort Severn First Nation --> Fort Severn
Q3078079 name_nl name cleaning : Fort Severn First Nation --> Fort Severn
```
### remove 市(city)
example changes:
```
Q68695 name_zh name cleaning : 泉州市 --> 泉州
Q74881 name_zh name cleaning : 大连市 --> 大连
Q74957 name_zh name cleaning : 鞍山市 --> 鞍山
Q92381 name_zh name cleaning : 白城市 --> 白城
```

View File

@ -41,8 +41,8 @@ function fetch_names {
python3 tools/wikidata/fetch_wikidata.py -input_shape_name ${nei_path}/${ne_shapepath}/${ne_shapefile}.shp \
-input_lettercase ${nei_letter_case} \
-output_csv_name ${neo_path}/${ne_shapepath}/${ne_shapefile}.new_names.csv
echo " created : ${neo_path}/${ne_shapepath}/${ne_shapefile}.new_names.csv "
echo " "
echo " created : ${neo_path}/${ne_shapepath}/${ne_shapefile}.new_names.csv "
echo " "
}
@ -80,7 +80,7 @@ function write_names {
csvtomd ${neo_path}/${ne_shapepath}/${ne_shapefile}.summary_log.csv > ${neo_path}/${ne_shapepath}/${ne_shapefile}.summary_log.csv.md
echo " "
echo "name_en/NAME_EN changes ${neo_path}/${ne_shapepath}/${ne_shapefile}) "
echo "show only name_en/NAME_EN changes : ${neo_path}/${ne_shapepath}/${ne_shapefile} "
echo "---------------------"
cat ${neo_path}/${ne_shapepath}/${ne_shapefile}.changes_log.csv.md | grep MODvalue | grep name_en || true
cat ${neo_path}/${ne_shapepath}/${ne_shapefile}.changes_log.csv.md | grep MODvalue | grep NAME_EN || true
@ -89,8 +89,8 @@ function write_names {
cat ${neo_path}/${ne_shapepath}/${ne_shapefile}.summary_log.csv.md
echo " "
echo " (write) created :"
echo " -------------------"
echo " (write) created shape and audit files:"
echo " -------------------"
ls -Gga ${neo_path}/${ne_shapepath}/${ne_shapefile}*
echo ""
}
@ -106,30 +106,30 @@ function copy_names {
if [[ "$mode" == "fetch" ]]
then
#echo "fetch"
fetch_names
fetch_names
elif [[ "$mode" == "write" ]]
then
#echo "write"
write_names
elif [[ "$mode" == "fetch_write" ]]
then
write_names
elif [[ "$mode" == "fetch_write" ]]
then
#echo "fetch_write "
fetch_names
write_names
fetch_names
write_names
elif [[ "$mode" == "copy" ]]
then
#echo "copy files"
copy_names
elif [[ "$mode" == "all" ]]
then
copy_names
elif [[ "$mode" == "all" ]]
then
#echo "fetch + write + copy"
fetch_names
fetch_names
write_names
copy_names
else
copy_names
else
echo "Unknown mode! the first parameter should be:[fetch/write/fetch_write/copy/all]"
exit 1
fi
exit 1
fi
exit