[FEATURE][processing] New universal 'basic stats for field' algorithm

Replaces the existing 'Basic Stats for Numeric Fields' and
'Basic Stats for String Fields' algorithms and adds support
for date/time/datetime fields.

Having a single unified algorithm allows more flexible models
where a field type may not be known in advance.

Deprecate existing basic stats algorithms
This commit is contained in:
Nyall Dawson 2016-11-30 15:59:24 +10:00
parent 1ff165a32f
commit b30a1ff65d
13 changed files with 383 additions and 11 deletions

View File

@ -22,14 +22,10 @@ qgis:advancedpythonfieldcalculator: >
qgis:barplot:
qgis:basicstatisticsforfields: >
This algorithm generates basic statistics from the analysis of the values in a field in the attribute table of a vector layer. Numeric, date, time and string fields are supported.
qgis:basicstatisticsfornumericfields: >
This algorithm generates basic statistics from the analysis of a numeric field in the attribute table of a vector layer.
Statistics are generated as an HTML file.
qgis:basicstatisticsfortextfields: >
This algorithm generates basic statistics from the analysis of a text field in the attribute table of a vector layer.
The statistics returned will depend on the field type.
Statistics are generated as an HTML file.

View File

@ -0,0 +1,251 @@
# -*- coding: utf-8 -*-
"""
***************************************************************************
BasicStatistics.py
---------------------
Date : November 2016
Copyright : (C) 2016 by Nyall Dawson
Email : nyall dot dawson at gmail dot com
***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
***************************************************************************
"""
__author__ = 'Nyall Dawson'
__date__ = 'November 2016'
__copyright__ = '(C) 2016, Nyall Dawson'
# This will get replaced with a git SHA1 when you do a git archive
__revision__ = '$Format:%H$'
import os
import codecs
from qgis.PyQt.QtCore import QVariant
from qgis.PyQt.QtGui import QIcon
from qgis.core import (QgsStatisticalSummary,
QgsStringStatisticalSummary,
QgsDateTimeStatisticalSummary,
QgsFeatureRequest)
from processing.core.GeoAlgorithm import GeoAlgorithm
from processing.core.parameters import ParameterTable
from processing.core.parameters import ParameterTableField
from processing.core.outputs import OutputHTML
from processing.core.outputs import OutputNumber
from processing.tools import dataobjects, vector
# Directory three levels above this module's file; icons are resolved
# relative to it.
pluginPath = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
class BasicStatisticsForField(GeoAlgorithm):
    """Processing algorithm reporting basic statistics for one table field.

    The type of the selected field decides which summary is computed:
    numeric fields use ``QgsStatisticalSummary``, date/time/datetime fields
    use ``QgsDateTimeStatisticalSummary`` and every other field is handled
    as text through ``QgsStringStatisticalSummary``.

    Results are written to an HTML report and are also published as
    individual outputs. Only the outputs that apply to the detected field
    type are set by a given run.
    """

    # Parameter identifiers
    INPUT_LAYER = 'INPUT_LAYER'
    FIELD_NAME = 'FIELD_NAME'

    # Output identifiers
    OUTPUT_HTML_FILE = 'OUTPUT_HTML_FILE'
    MIN = 'MIN'
    MAX = 'MAX'
    COUNT = 'COUNT'
    UNIQUE = 'UNIQUE'
    EMPTY = 'EMPTY'
    FILLED = 'FILLED'
    MIN_LENGTH = 'MIN_LENGTH'
    MAX_LENGTH = 'MAX_LENGTH'
    MEAN_LENGTH = 'MEAN_LENGTH'
    CV = 'CV'
    SUM = 'SUM'
    MEAN = 'MEAN'
    STD_DEV = 'STD_DEV'
    RANGE = 'RANGE'
    MEDIAN = 'MEDIAN'
    MINORITY = 'MINORITY'
    MAJORITY = 'MAJORITY'
    FIRSTQUARTILE = 'FIRSTQUARTILE'
    THIRDQUARTILE = 'THIRDQUARTILE'
    IQR = 'IQR'

    def getIcon(self):
        """Return the icon used to represent this algorithm."""
        return QIcon(os.path.join(pluginPath, 'images', 'ftools', 'basic_statistics.png'))

    def defineCharacteristics(self):
        """Declare the algorithm's name, group, tags, parameters and outputs."""
        self.name, self.i18n_name = self.trAlgorithm('Basic statistics for fields')
        self.group, self.i18n_group = self.trAlgorithm('Vector table tools')
        self.tags = self.tr('stats,statistics,date,time,datetime,string,number,text,table,layer,maximum,minimum,mean,average,standard,deviation,'
                            'count,distinct,unique,variance,median,quartile,range,majority,minority')

        self.addParameter(ParameterTable(self.INPUT_LAYER,
                                         self.tr('Input table')))
        self.addParameter(ParameterTableField(self.FIELD_NAME,
                                              self.tr('Field to calculate statistics on'),
                                              self.INPUT_LAYER))

        self.addOutput(OutputHTML(self.OUTPUT_HTML_FILE,
                                  self.tr('Statistics')))

        self.addOutput(OutputNumber(self.COUNT, self.tr('Count')))
        self.addOutput(OutputNumber(self.UNIQUE, self.tr('Number of unique values')))
        self.addOutput(OutputNumber(self.EMPTY, self.tr('Number of empty (null) values')))
        self.addOutput(OutputNumber(self.FILLED, self.tr('Number of non-empty values')))
        self.addOutput(OutputNumber(self.MIN, self.tr('Minimum value')))
        self.addOutput(OutputNumber(self.MAX, self.tr('Maximum value')))
        self.addOutput(OutputNumber(self.MIN_LENGTH, self.tr('Minimum length')))
        self.addOutput(OutputNumber(self.MAX_LENGTH, self.tr('Maximum length')))
        self.addOutput(OutputNumber(self.MEAN_LENGTH, self.tr('Mean length')))
        self.addOutput(OutputNumber(self.CV, self.tr('Coefficient of Variation')))
        self.addOutput(OutputNumber(self.SUM, self.tr('Sum')))
        self.addOutput(OutputNumber(self.MEAN, self.tr('Mean value')))
        self.addOutput(OutputNumber(self.STD_DEV, self.tr('Standard deviation')))
        self.addOutput(OutputNumber(self.RANGE, self.tr('Range')))
        self.addOutput(OutputNumber(self.MEDIAN, self.tr('Median')))
        self.addOutput(OutputNumber(self.MINORITY, self.tr('Minority (rarest occurring value)')))
        self.addOutput(OutputNumber(self.MAJORITY, self.tr('Majority (most frequently occurring value)')))
        self.addOutput(OutputNumber(self.FIRSTQUARTILE, self.tr('First quartile')))
        self.addOutput(OutputNumber(self.THIRDQUARTILE, self.tr('Third quartile')))
        self.addOutput(OutputNumber(self.IQR, self.tr('Interquartile Range (IQR)')))

    def processAlgorithm(self, progress):
        """Compute the statistics for the chosen field and write the report.

        :param progress: object receiving ``setPercentage`` updates while
            the features are read.
        """
        layer = dataobjects.getObjectFromUri(
            self.getParameterValue(self.INPUT_LAYER))
        field_name = self.getParameterValue(self.FIELD_NAME)
        field = layer.fields().at(layer.fields().lookupField(field_name))

        output_file = self.getOutputValue(self.OUTPUT_HTML_FILE)

        # Only the analysed attribute is requested; geometry is not needed.
        request = QgsFeatureRequest().setFlags(QgsFeatureRequest.NoGeometry).setSubsetOfAttributes([field_name], layer.fields())
        features = vector.features(layer, request)

        data = []
        data.append(self.tr('Analyzed layer: {}').format(layer.name()))
        data.append(self.tr('Analyzed field: {}').format(field_name))

        # Dispatch on the field type; anything that is neither numeric nor
        # date/time is summarised as text.
        if field.isNumeric():
            data.extend(self.calcNumericStats(features, progress, field))
        elif field.type() in (QVariant.Date, QVariant.Time, QVariant.DateTime):
            data.extend(self.calcDateTimeStats(features, progress, field))
        else:
            data.extend(self.calcStringStats(features, progress, field))

        self.createHTML(output_file, data)

    def _feedSummary(self, features, progress, field, add_value):
        """Pass each feature's value for ``field`` to ``add_value``.

        Progress is reported through ``progress.setPercentage`` as the
        features are consumed.

        :param features: sized iterable of features.
        :param progress: object exposing ``setPercentage``.
        :param field: field whose ``name()`` selects the attribute to read.
        :param add_value: callable invoked once per feature with the value.
        :returns: the number of features, i.e. ``len(features)``.
        """
        count = len(features)
        # An empty table must not raise ZeroDivisionError; with no features
        # the loop below never runs, so the step value is irrelevant.
        step = 100.0 / count if count else 0
        name = field.name()
        for current, ft in enumerate(features):
            add_value(ft[name])
            progress.setPercentage(int(current * step))
        return count

    def calcNumericStats(self, features, progress, field):
        """Compute numeric statistics, set the outputs, return report lines.

        :param features: sized iterable of features to analyse.
        :param progress: object exposing ``setPercentage``.
        :param field: the numeric field being analysed.
        :returns: list of translated ``'label: value'`` strings.
        """
        stat = QgsStatisticalSummary()
        count = self._feedSummary(features, progress, field, stat.addVariant)
        stat.finalize()

        # A coefficient of variation is undefined for a zero mean; report 0.
        mean = stat.mean()
        cv = stat.stDev() / mean if mean != 0 else 0

        self.setOutputValue(self.COUNT, stat.count())
        self.setOutputValue(self.UNIQUE, stat.variety())
        self.setOutputValue(self.EMPTY, stat.countMissing())
        self.setOutputValue(self.FILLED, count - stat.countMissing())
        self.setOutputValue(self.MIN, stat.min())
        self.setOutputValue(self.MAX, stat.max())
        self.setOutputValue(self.RANGE, stat.range())
        self.setOutputValue(self.SUM, stat.sum())
        self.setOutputValue(self.MEAN, stat.mean())
        self.setOutputValue(self.MEDIAN, stat.median())
        self.setOutputValue(self.STD_DEV, stat.stDev())
        self.setOutputValue(self.CV, cv)
        self.setOutputValue(self.MINORITY, stat.minority())
        self.setOutputValue(self.MAJORITY, stat.majority())
        self.setOutputValue(self.FIRSTQUARTILE, stat.firstQuartile())
        self.setOutputValue(self.THIRDQUARTILE, stat.thirdQuartile())
        self.setOutputValue(self.IQR, stat.interQuartileRange())

        data = []
        data.append(self.tr('Count: {}').format(stat.count()))
        data.append(self.tr('Unique values: {}').format(stat.variety()))
        data.append(self.tr('NULL (missing) values: {}').format(stat.countMissing()))
        data.append(self.tr('Minimum value: {}').format(stat.min()))
        data.append(self.tr('Maximum value: {}').format(stat.max()))
        data.append(self.tr('Range: {}').format(stat.range()))
        data.append(self.tr('Sum: {}').format(stat.sum()))
        data.append(self.tr('Mean value: {}').format(stat.mean()))
        data.append(self.tr('Median value: {}').format(stat.median()))
        data.append(self.tr('Standard deviation: {}').format(stat.stDev()))
        data.append(self.tr('Coefficient of Variation: {}').format(cv))
        data.append(self.tr('Minority (rarest occurring value): {}').format(stat.minority()))
        data.append(self.tr('Majority (most frequently occurring value): {}').format(stat.majority()))
        data.append(self.tr('First quartile: {}').format(stat.firstQuartile()))
        data.append(self.tr('Third quartile: {}').format(stat.thirdQuartile()))
        data.append(self.tr('Interquartile Range (IQR): {}').format(stat.interQuartileRange()))
        return data

    def calcStringStats(self, features, progress, field):
        """Compute text statistics, set the outputs, return report lines.

        :param features: sized iterable of features to analyse.
        :param progress: object exposing ``setPercentage``.
        :param field: the field being analysed as text.
        :returns: list of translated ``'label: value'`` strings.
        """
        stat = QgsStringStatisticalSummary()
        count = self._feedSummary(features, progress, field, stat.addValue)
        stat.finalize()

        self.setOutputValue(self.COUNT, stat.count())
        self.setOutputValue(self.UNIQUE, stat.countDistinct())
        self.setOutputValue(self.EMPTY, stat.countMissing())
        self.setOutputValue(self.FILLED, stat.count() - stat.countMissing())
        self.setOutputValue(self.MIN, stat.min())
        self.setOutputValue(self.MAX, stat.max())
        self.setOutputValue(self.MIN_LENGTH, stat.minLength())
        self.setOutputValue(self.MAX_LENGTH, stat.maxLength())
        self.setOutputValue(self.MEAN_LENGTH, stat.meanLength())

        data = []
        data.append(self.tr('Count: {}').format(count))
        data.append(self.tr('Unique values: {}').format(stat.countDistinct()))
        data.append(self.tr('NULL (missing) values: {}').format(stat.countMissing()))
        data.append(self.tr('Minimum value: {}').format(stat.min()))
        data.append(self.tr('Maximum value: {}').format(stat.max()))
        data.append(self.tr('Minimum length: {}').format(stat.minLength()))
        data.append(self.tr('Maximum length: {}').format(stat.maxLength()))
        data.append(self.tr('Mean length: {}').format(stat.meanLength()))
        return data

    def calcDateTimeStats(self, features, progress, field):
        """Compute date/time statistics, set the outputs, return report lines.

        :param features: sized iterable of features to analyse.
        :param progress: object exposing ``setPercentage``.
        :param field: the date, time or datetime field being analysed; its
            ``displayString`` formats the minimum and maximum for the report.
        :returns: list of translated ``'label: value'`` strings.
        """
        stat = QgsDateTimeStatisticalSummary()
        count = self._feedSummary(features, progress, field, stat.addValue)
        stat.finalize()

        minimum = stat.statistic(QgsDateTimeStatisticalSummary.Min)
        maximum = stat.statistic(QgsDateTimeStatisticalSummary.Max)

        self.setOutputValue(self.COUNT, stat.count())
        self.setOutputValue(self.UNIQUE, stat.countDistinct())
        self.setOutputValue(self.EMPTY, stat.countMissing())
        self.setOutputValue(self.FILLED, stat.count() - stat.countMissing())
        self.setOutputValue(self.MIN, minimum)
        self.setOutputValue(self.MAX, maximum)

        data = []
        data.append(self.tr('Count: {}').format(count))
        data.append(self.tr('Unique values: {}').format(stat.countDistinct()))
        data.append(self.tr('NULL (missing) values: {}').format(stat.countMissing()))
        data.append(self.tr('Minimum value: {}').format(field.displayString(minimum)))
        data.append(self.tr('Maximum value: {}').format(field.displayString(maximum)))
        return data

    def createHTML(self, outputFile, algData):
        """Write ``algData`` to ``outputFile`` as a minimal UTF-8 HTML page.

        Each entry becomes one ``<p>`` element. Entries are escaped so that
        layer names or attribute values containing ``&``, ``<`` or ``>``
        cannot corrupt the generated markup.

        :param outputFile: path of the HTML file to create or overwrite.
        :param algData: iterable of report lines; each is passed to ``str``.
        """
        # Local import: this stdlib module is not imported at file level.
        from html import escape

        with codecs.open(outputFile, 'w', encoding='utf-8') as f:
            f.write('<html><head>\n')
            # Adjacent literals instead of a backslash continuation inside
            # the string, which would embed source indentation in the output.
            f.write('<meta http-equiv="Content-Type" content="text/html; '
                    'charset=utf-8" /></head><body>\n')
            for s in algData:
                f.write('<p>' + escape(str(s), quote=False) + '</p>\n')
            f.write('</body></html>\n')

View File

@ -68,12 +68,18 @@ class BasicStatisticsNumbers(GeoAlgorithm):
NULLVALUES = 'NULLVALUES'
IQR = 'IQR'
# Run the base GeoAlgorithm initialiser first, then set showInToolbox to
# False on this instance.
def __init__(self):
GeoAlgorithm.__init__(self)
# this algorithm is deprecated - use BasicStatisticsForField (BasicStatistics.py) instead
self.showInToolbox = False
def getIcon(self):
    """Return the icon used to represent this algorithm."""
    icon_file = os.path.join(pluginPath, 'images', 'ftools', 'basic_statistics.png')
    return QIcon(icon_file)
def defineCharacteristics(self):
self.name, self.i18n_name = self.trAlgorithm('Basic statistics for numeric fields')
self.group, self.i18n_group = self.trAlgorithm('Vector table tools')
self.tags = self.tr('stats,statistics,number,table,layer')
self.addParameter(ParameterTable(self.INPUT_LAYER,
self.tr('Input vector layer')))

View File

@ -60,12 +60,18 @@ class BasicStatisticsStrings(GeoAlgorithm):
MIN_VALUE = 'MIN_VALUE'
MAX_VALUE = 'MAX_VALUE'
# Run the base GeoAlgorithm initialiser first, then set showInToolbox to
# False on this instance.
def __init__(self):
GeoAlgorithm.__init__(self)
# this algorithm is deprecated - use BasicStatisticsForField (BasicStatistics.py) instead
self.showInToolbox = False
def getIcon(self):
    """Return the icon used to represent this algorithm."""
    icon_file = os.path.join(pluginPath, 'images', 'ftools', 'basic_statistics.png')
    return QIcon(icon_file)
def defineCharacteristics(self):
self.name, self.i18n_name = self.trAlgorithm('Basic statistics for text fields')
self.group, self.i18n_group = self.trAlgorithm('Vector table tools')
self.tags = self.tr('stats,statistics,string,table,layer')
self.addParameter(ParameterTable(self.INPUT_LAYER,
self.tr('Input vector layer')))

View File

@ -180,6 +180,7 @@ from .SnapGeometries import SnapGeometriesToLayer
from .PoleOfInaccessibility import PoleOfInaccessibility
from .CreateAttributeIndex import CreateAttributeIndex
from .DropGeometry import DropGeometry
from .BasicStatistics import BasicStatisticsForField
pluginPath = os.path.normpath(os.path.join(
os.path.split(os.path.dirname(__file__))[0], os.pardir))
@ -243,7 +244,8 @@ class QGISAlgorithmProvider(AlgorithmProvider):
TinInterpolationZValue(), TinInterpolationAttribute(),
RemoveNullGeometry(), ExtractByExpression(), ExtendLines(),
ExtractSpecificNodes(), GeometryByExpression(), SnapGeometriesToLayer(),
PoleOfInaccessibility(), CreateAttributeIndex(), DropGeometry()
PoleOfInaccessibility(), CreateAttributeIndex(), DropGeometry(),
BasicStatisticsForField()
]
if hasMatplotlib:

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,10 @@
!table
!version 900
!charset Neutral
Definition Table
Type NATIVE Charset "Neutral"
Fields 3
date Date ;
time Time ;
date_time DateTime ;

View File

@ -0,0 +1,10 @@
<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>
<p>Analyzed layer: custom/datetimes.tab</p>
<p>Analyzed field: date</p>
<p>Count: 4</p>
<p>Unique values: 3</p>
<p>NULL (missing) values: 1</p>
<p>Minimum value: 2014-11-30T00:00:00</p>
<p>Maximum value: 2016-11-30T00:00:00</p>
</body></html>

View File

@ -0,0 +1,10 @@
<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>
<p>Analyzed layer: custom/datetimes.tab</p>
<p>Analyzed field: date_time</p>
<p>Count: 4</p>
<p>Unique values: 3</p>
<p>NULL (missing) values: 1</p>
<p>Minimum value: 2014-11-30T14:30:02</p>
<p>Maximum value: 2016-11-30T14:29:22</p>
</body></html>

View File

@ -0,0 +1,10 @@
<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>
<p>Analyzed layer: custom/datetimes.tab</p>
<p>Analyzed field: time</p>
<p>Count: 4</p>
<p>Unique values: 3</p>
<p>NULL (missing) values: 1</p>
<p>Minimum value: 03:29:40</p>
<p>Maximum value: 15:29:22</p>
</body></html>

View File

@ -152,7 +152,7 @@ tests:
fields:
fid: skip
- algorithm: qgis:basicstatisticsfornumericfields
- algorithm: qgis:basicstatisticsforfields
name: Basic statistics for numeric fields
params:
- name: multipolys.gml
@ -182,7 +182,7 @@ tests:
- 'NULL \(missing\) values: 1'
- 'Interquartile Range \(IQR\): 0.123'
- algorithm: qgis:basicstatisticsfortextfields
- algorithm: qgis:basicstatisticsforfields
name: Basic statistics for text fields
params:
- name: multipolys.gml
@ -191,7 +191,18 @@ tests:
results:
OUTPUT_HTML_FILE:
name: expected/basic_statistics_string.html
type: file
type: regex
rules:
- 'Analyzed layer: multipolys.gml'
- 'Analyzed field: Bname'
- 'Count: 4'
- 'Unique values: 2'
- 'Minimum value: Test'
- 'Maximum value: Test'
- 'Minimum length: 0'
- 'Maximum length: 4'
- 'Mean length: 3.0'
- 'NULL \(missing\) values: 1'
# Split lines with lines considers two cases
# case 1: two different layers
@ -1753,3 +1764,63 @@ tests:
OUTPUT:
name: expected/removed_holes_min_area.gml
type: vector
- algorithm: qgis:basicstatisticsforfields
name: Basic stats datetime
params:
FIELD_NAME: date_time
INPUT_LAYER:
name: custom/datetimes.tab
type: table
results:
OUTPUT_HTML_FILE:
name: expected/basic_statistics_datetime.html
type: regex
rules:
- 'Analyzed layer: custom/datetimes.tab'
- 'Analyzed field: date_time'
- 'Count: 4'
- 'Unique values: 3'
- 'Minimum value: 2014-11-30T14:30:02'
- 'Maximum value: 2016-11-30T14:29:22'
- 'NULL \(missing\) values: 1'
- algorithm: qgis:basicstatisticsforfields
name: Basic stats date
params:
FIELD_NAME: date
INPUT_LAYER:
name: custom/datetimes.tab
type: table
results:
OUTPUT_HTML_FILE:
name: expected/basic_statistics_date.html
type: regex
rules:
- 'Analyzed layer: custom/datetimes.tab'
- 'Analyzed field: date'
- 'Count: 4'
- 'Unique values: 3'
- 'Minimum value: 2014-11-30T00:00:00'
- 'Maximum value: 2016-11-30T00:00:00'
- 'NULL \(missing\) values: 1'
- algorithm: qgis:basicstatisticsforfields
name: Basic stats time
params:
FIELD_NAME: time
INPUT_LAYER:
name: custom/datetimes.tab
type: table
results:
OUTPUT_HTML_FILE:
name: expected/basic_statistics_time.html
type: regex
rules:
- 'Analyzed layer: custom/datetimes.tab'
- 'Analyzed field: time'
- 'Count: 4'
- 'Unique values: 3'
- 'Minimum value: 03:29:40'
- 'Maximum value: 15:29:22'
- 'NULL \(missing\) values: 1'