QGIS/python/plugins/processing/algs/qgis/StatisticsByCategories.py
2024-11-29 15:38:02 +01:00

373 lines
13 KiB
Python

"""
***************************************************************************
StatisticsByCategories.py
---------------------
Date : September 2012
Copyright : (C) 2012 by Victor Olaya
Email : volayaf at gmail dot com
***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
***************************************************************************
"""
__author__ = "Victor Olaya"
__date__ = "September 2012"
__copyright__ = "(C) 2012, Victor Olaya"
from qgis.core import (
QgsProcessingParameterFeatureSource,
QgsStatisticalSummary,
QgsDateTimeStatisticalSummary,
QgsStringStatisticalSummary,
QgsFeatureRequest,
QgsApplication,
QgsProcessingException,
QgsProcessingParameterField,
QgsProcessingParameterFeatureSink,
QgsFields,
QgsField,
QgsWkbTypes,
QgsCoordinateReferenceSystem,
QgsFeature,
QgsFeatureSink,
QgsProcessing,
QgsProcessingFeatureSource,
NULL,
)
from qgis.PyQt.QtCore import QMetaType
from processing.algs.qgis.QgisAlgorithm import QgisAlgorithm
from collections import defaultdict
class StatisticsByCategories(QgisAlgorithm):
INPUT = "INPUT"
VALUES_FIELD_NAME = "VALUES_FIELD_NAME"
CATEGORIES_FIELD_NAME = "CATEGORIES_FIELD_NAME"
OUTPUT = "OUTPUT"
def group(self):
return self.tr("Vector analysis")
def groupId(self):
return "vectoranalysis"
def tags(self):
return self.tr(
"groups,stats,statistics,table,layer,sum,maximum,minimum,mean,average,standard,deviation,"
"count,distinct,unique,variance,median,quartile,range,majority,minority,histogram,distinct,summary"
).split(",")
def icon(self):
return QgsApplication.getThemeIcon("/algorithms/mAlgorithmBasicStatistics.svg")
def svgIconPath(self):
return QgsApplication.iconPath("/algorithms/mAlgorithmBasicStatistics.svg")
def __init__(self):
super().__init__()
def initAlgorithm(self, config=None):
self.addParameter(
QgsProcessingParameterFeatureSource(
self.INPUT,
self.tr("Input vector layer"),
types=[QgsProcessing.SourceType.TypeVector],
)
)
self.addParameter(
QgsProcessingParameterField(
self.VALUES_FIELD_NAME,
self.tr(
"Field to calculate statistics on (if empty, only count is calculated)"
),
parentLayerParameterName=self.INPUT,
optional=True,
)
)
self.addParameter(
QgsProcessingParameterField(
self.CATEGORIES_FIELD_NAME,
self.tr("Field(s) with categories"),
parentLayerParameterName=self.INPUT,
type=QgsProcessingParameterField.DataType.Any,
allowMultiple=True,
)
)
self.addParameter(
QgsProcessingParameterFeatureSink(
self.OUTPUT, self.tr("Statistics by category")
)
)
def name(self):
return "statisticsbycategories"
def displayName(self):
return self.tr("Statistics by categories")
def processAlgorithm(self, parameters, context, feedback):
source = self.parameterAsSource(parameters, self.INPUT, context)
if source is None:
raise QgsProcessingException(
self.invalidSourceError(parameters, self.INPUT)
)
value_field_name = self.parameterAsString(
parameters, self.VALUES_FIELD_NAME, context
)
category_field_names = self.parameterAsFields(
parameters, self.CATEGORIES_FIELD_NAME, context
)
value_field_index = source.fields().lookupField(value_field_name)
if value_field_index >= 0:
value_field = source.fields().at(value_field_index)
else:
value_field = None
category_field_indexes = list()
# generate output fields
fields = QgsFields()
for field_name in category_field_names:
c = source.fields().lookupField(field_name)
if c == -1:
raise QgsProcessingException(
self.tr('Field "{field_name}" does not exist.').format(
field_name=field_name
)
)
category_field_indexes.append(c)
fields.append(source.fields().at(c))
def addField(name):
"""
Adds a field to the output, keeping the same data type as the value_field
"""
field = QgsField(value_field)
field.setName(name)
fields.append(field)
if value_field is None:
field_type = "none"
fields.append(QgsField("count", QMetaType.Type.Int))
elif value_field.isNumeric():
field_type = "numeric"
fields.append(QgsField("count", QMetaType.Type.Int))
fields.append(QgsField("unique", QMetaType.Type.Int))
fields.append(QgsField("min", QMetaType.Type.Double))
fields.append(QgsField("max", QMetaType.Type.Double))
fields.append(QgsField("range", QMetaType.Type.Double))
fields.append(QgsField("sum", QMetaType.Type.Double))
fields.append(QgsField("mean", QMetaType.Type.Double))
fields.append(QgsField("median", QMetaType.Type.Double))
fields.append(QgsField("stddev", QMetaType.Type.Double))
fields.append(QgsField("minority", QMetaType.Type.Double))
fields.append(QgsField("majority", QMetaType.Type.Double))
fields.append(QgsField("q1", QMetaType.Type.Double))
fields.append(QgsField("q3", QMetaType.Type.Double))
fields.append(QgsField("iqr", QMetaType.Type.Double))
elif value_field.type() in (
QMetaType.Type.QDate,
QMetaType.Type.QTime,
QMetaType.Type.QDateTime,
):
field_type = "datetime"
fields.append(QgsField("count", QMetaType.Type.Int))
fields.append(QgsField("unique", QMetaType.Type.Int))
fields.append(QgsField("empty", QMetaType.Type.Int))
fields.append(QgsField("filled", QMetaType.Type.Int))
# keep same data type for these fields
addField("min")
addField("max")
else:
field_type = "string"
fields.append(QgsField("count", QMetaType.Type.Int))
fields.append(QgsField("unique", QMetaType.Type.Int))
fields.append(QgsField("empty", QMetaType.Type.Int))
fields.append(QgsField("filled", QMetaType.Type.Int))
# keep same data type for these fields
addField("min")
addField("max")
fields.append(QgsField("min_length", QMetaType.Type.Int))
fields.append(QgsField("max_length", QMetaType.Type.Int))
fields.append(QgsField("mean_length", QMetaType.Type.Double))
request = QgsFeatureRequest().setFlags(QgsFeatureRequest.Flag.NoGeometry)
if value_field is not None:
attrs = [value_field_index]
else:
attrs = []
attrs.extend(category_field_indexes)
request.setSubsetOfAttributes(attrs)
features = source.getFeatures(
request, QgsProcessingFeatureSource.Flag.FlagSkipGeometryValidityChecks
)
total = 50.0 / source.featureCount() if source.featureCount() else 0
if field_type == "none":
values = defaultdict(int)
else:
values = defaultdict(list)
for current, feat in enumerate(features):
if feedback.isCanceled():
break
feedback.setProgress(int(current * total))
attrs = feat.attributes()
cat = tuple([attrs[c] for c in category_field_indexes])
if field_type == "none":
values[cat] += 1
continue
if field_type == "numeric":
if attrs[value_field_index] == NULL:
continue
else:
value = float(attrs[value_field_index])
elif field_type == "string":
if attrs[value_field_index] == NULL:
value = ""
else:
value = str(attrs[value_field_index])
elif attrs[value_field_index] == NULL:
value = NULL
else:
value = attrs[value_field_index]
values[cat].append(value)
(sink, dest_id) = self.parameterAsSink(
parameters,
self.OUTPUT,
context,
fields,
QgsWkbTypes.Type.NoGeometry,
QgsCoordinateReferenceSystem(),
)
if sink is None:
raise QgsProcessingException(self.invalidSinkError(parameters, self.OUTPUT))
if field_type == "none":
self.saveCounts(values, sink, feedback)
elif field_type == "numeric":
self.calcNumericStats(values, sink, feedback)
elif field_type == "datetime":
self.calcDateTimeStats(values, sink, feedback)
else:
self.calcStringStats(values, sink, feedback)
sink.finalize()
return {self.OUTPUT: dest_id}
def saveCounts(self, values, sink, feedback):
total = 50.0 / len(values) if values else 0
current = 0
for cat, v in values.items():
if feedback.isCanceled():
break
feedback.setProgress(int(current * total) + 50)
f = QgsFeature()
f.setAttributes(list(cat) + [v])
sink.addFeature(f, QgsFeatureSink.Flag.FastInsert)
current += 1
def calcNumericStats(self, values, sink, feedback):
stat = QgsStatisticalSummary()
total = 50.0 / len(values) if values else 0
current = 0
for cat, v in values.items():
if feedback.isCanceled():
break
feedback.setProgress(int(current * total) + 50)
stat.calculate(v)
f = QgsFeature()
f.setAttributes(
list(cat)
+ [
stat.count(),
stat.variety(),
stat.min(),
stat.max(),
stat.range(),
stat.sum(),
stat.mean(),
stat.median(),
stat.stDev(),
stat.minority(),
stat.majority(),
stat.firstQuartile(),
stat.thirdQuartile(),
stat.interQuartileRange(),
]
)
sink.addFeature(f, QgsFeatureSink.Flag.FastInsert)
current += 1
def calcDateTimeStats(self, values, sink, feedback):
stat = QgsDateTimeStatisticalSummary()
total = 50.0 / len(values) if values else 0
current = 0
for cat, v in values.items():
if feedback.isCanceled():
break
feedback.setProgress(int(current * total) + 50)
stat.calculate(v)
f = QgsFeature()
f.setAttributes(
list(cat)
+ [
stat.count(),
stat.countDistinct(),
stat.countMissing(),
stat.count() - stat.countMissing(),
stat.statistic(QgsDateTimeStatisticalSummary.Statistic.Min),
stat.statistic(QgsDateTimeStatisticalSummary.Statistic.Max),
]
)
sink.addFeature(f, QgsFeatureSink.Flag.FastInsert)
current += 1
def calcStringStats(self, values, sink, feedback):
stat = QgsStringStatisticalSummary()
total = 50.0 / len(values) if values else 0
current = 0
for cat, v in values.items():
if feedback.isCanceled():
break
feedback.setProgress(int(current * total) + 50)
stat.calculate(v)
f = QgsFeature()
f.setAttributes(
list(cat)
+ [
stat.count(),
stat.countDistinct(),
stat.countMissing(),
stat.count() - stat.countMissing(),
stat.min(),
stat.max(),
stat.minLength(),
stat.maxLength(),
stat.meanLength(),
]
)
sink.addFeature(f, QgsFeatureSink.Flag.FastInsert)
current += 1