New class QgsStringStatisticalSummary, for calculating statistics on lists of strings
This commit is contained in:
Nyall Dawson 2016-05-09 15:33:29 +10:00
parent ceba5264f7
commit 0493cbfc21
7 changed files with 508 additions and 0 deletions

View File

@ -127,6 +127,7 @@
%Include qgssnappingutils.sip
%Include qgsspatialindex.sip
%Include qgsstatisticalsummary.sip
%Include qgsstringstatisticalsummary.sip
%Include qgsstringutils.sip
%Include qgstolerance.sip
%Include qgstracer.sip

View File

@ -0,0 +1,119 @@
/** \ingroup core
* \class QgsStringStatisticalSummary
* \brief Calculator for summary statistics and aggregates for a list of strings.
*
* Statistics are calculated by calling @link calculate @endlink and passing a list of strings. The
* individual statistics can then be retrieved using the associated methods. Note that not all statistics
* are calculated by default. Statistics which require slower computations are only calculated by
* specifying the statistic in the constructor or via @link setStatistics @endlink.
*
* \note Added in version 2.16
*/
class QgsStringStatisticalSummary
{
%TypeHeaderCode
#include <qgsstringstatisticalsummary.h>
%End

  public:

    //! Enumeration of flags that specify statistics to be calculated
    enum Statistic
    {
      Count, //!< Number of strings processed
      CountDistinct, //!< Number of distinct string values
      CountMissing, //!< Number of missing (null or empty) values
      Min, //!< Minimum string value
      Max, //!< Maximum string value
      MinimumLength, //!< Minimum length of string
      MaximumLength, //!< Maximum length of string
      All, //!< All statistics
    };
    typedef QFlags<QgsStringStatisticalSummary::Statistic> Statistics;

    /** Constructor for QgsStringStatisticalSummary
     * @param stats flags for statistics to calculate
     */
    QgsStringStatisticalSummary( const QgsStringStatisticalSummary::Statistics& stats = All );

    /** Returns flags which specify which statistics will be calculated. Some statistics
     * are always calculated (eg count).
     * @see setStatistics
     */
    Statistics statistics() const;

    /** Sets flags which specify which statistics will be calculated. Some statistics
     * are always calculated (eg count).
     * @param stats flags for statistics to calculate
     * @see statistics
     */
    void setStatistics( const Statistics& stats );

    /** Resets the calculated values
     */
    void reset();

    /** Calculates summary statistics for a list of strings. Any previously
     * calculated values are discarded first.
     * @param values list of strings
     */
    void calculate( const QStringList& values );

    /** Calculates summary statistics for a list of variants. Any non-string variants will be
     * ignored. Any previously calculated values are discarded first.
     * @param values list of variants
     */
    void calculate( const QVariantList& values );

    /** Returns the value of a specified statistic
     * @param stat statistic to return
     * @returns calculated value of statistic
     */
    QVariant statistic( Statistic stat ) const;

    /** Returns the calculated count of values.
     */
    int count() const;

    /** Returns the number of distinct string values.
     * @see distinctValues()
     */
    int countDistinct() const;

    /** Returns the set of distinct string values.
     * @see countDistinct()
     */
    QSet< QString > distinctValues() const;

    /** Returns the number of missing (null or empty) string values.
     */
    int countMissing() const;

    /** Returns the minimum (non-empty) string value.
     */
    QString min() const;

    /** Returns the maximum (non-empty) string value.
     */
    QString max() const;

    /** Returns the minimum length of strings.
     */
    int minLength() const;

    /** Returns the maximum length of strings.
     */
    int maxLength() const;

    /** Returns the friendly display name for a statistic
     * @param statistic statistic to return name for
     */
    static QString displayName( Statistic statistic );
};
QFlags<QgsStringStatisticalSummary::Statistic> operator|(QgsStringStatisticalSummary::Statistic f1, QFlags<QgsStringStatisticalSummary::Statistic> f2);

View File

@ -194,6 +194,7 @@ SET(QGIS_CORE_SRCS
qgssqlexpressioncompiler.cpp
qgssqliteexpressioncompiler.cpp
qgsstatisticalsummary.cpp
qgsstringstatisticalsummary.cpp
qgsstringutils.cpp
qgstextlabelfeature.cpp
qgstolerance.cpp
@ -689,6 +690,7 @@ SET(QGIS_CORE_HDRS
qgsspatialindex.h
qgssqlexpressioncompiler.h
qgsstatisticalsummary.h
qgsstringstatisticalsummary.h
qgsstringutils.h
qgstextlabelfeature.h
qgstolerance.h

View File

@ -0,0 +1,154 @@
/***************************************************************************
qgsstringstatisticalsummary.cpp
-------------------------------
Date : May 2016
Copyright : (C) 2016 by Nyall Dawson
Email : nyall dot dawson at gmail dot com
***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
***************************************************************************/
#include "qgsstringstatisticalsummary.h"
#include <QString>
#include <QStringList>
#include <QObject>
#include <QVariant>
#include <QVariantList>
#include "limits.h"
/***************************************************************************
* This class is considered CRITICAL and any change MUST be accompanied with
* full unit tests in test_qgsstringstatisticalsummary.py.
* See details in QEP #17
****************************************************************************/
// Builds a calculator which will compute the statistics flagged in stats.
QgsStringStatisticalSummary::QgsStringStatisticalSummary( const QgsStringStatisticalSummary::Statistics& stats )
    : mStatistics( stats )
{
  // None of the result members are initialised above, so put them all
  // into their well-defined "nothing calculated yet" state.
  reset();
}
// Discards every calculated value. The requested statistic flags are left untouched.
void QgsStringStatisticalSummary::reset()
{
  // counters
  mCount = 0;
  mCountMissing = 0;

  // set of distinct values seen
  mValues.clear();

  // extreme string values
  mMin.clear();
  mMax.clear();

  // length extremes: the minimum starts at the largest possible int so that
  // the first tested string always replaces it
  mMaxLength = 0;
  mMinLength = INT_MAX;
}
void QgsStringStatisticalSummary::calculate( const QStringList& values )
{
reset();
Q_FOREACH ( const QString& string, values )
{
testString( string );
}
}
void QgsStringStatisticalSummary::calculate( const QVariantList& values )
{
reset();
Q_FOREACH ( const QVariant& variant, values )
{
if ( variant.type() == QVariant::String )
{
testString( variant.toString() );
}
}
}
// Folds a single string into the running statistics.
// Count, missing count and the length extremes are always updated; the
// distinct set, minimum and maximum only when their flag is requested.
void QgsStringStatisticalSummary::testString( const QString& string )
{
  const bool blank = string.isEmpty();
  const int length = string.length();

  ++mCount;
  if ( blank )
    ++mCountMissing;

  if ( mStatistics & CountDistinct )
    mValues.insert( string );

  // empty strings never take part in the min/max comparison; an empty
  // mMin/mMax means "no candidate yet", so the first non-empty string wins
  if ( !blank )
  {
    if ( mStatistics & Min )
      mMin = mMin.isEmpty() ? string : qMin( mMin, string );

    if ( mStatistics & Max )
      mMax = mMax.isEmpty() ? string : qMax( mMax, string );
  }

  mMinLength = qMin( mMinLength, length );
  mMaxLength = qMax( mMaxLength, length );
}
// Returns the calculated value for a single statistic, wrapped in a variant.
// The combined All flag (and any unrecognised value) has no single result
// and yields 0.
QVariant QgsStringStatisticalSummary::statistic( QgsStringStatisticalSummary::Statistic stat ) const
{
  QVariant result( 0 );
  switch ( stat )
  {
    case Count:
      result = mCount;
      break;
    case CountDistinct:
      result = mValues.count();
      break;
    case CountMissing:
      result = mCountMissing;
      break;
    case Min:
      result = mMin;
      break;
    case Max:
      result = mMax;
      break;
    case MinimumLength:
      result = mMinLength;
      break;
    case MaximumLength:
      result = mMaxLength;
      break;
    case All:
      // not an individual statistic, keep the default of 0
      break;
  }
  return result;
}
// Returns the translated, user-facing name of a statistic. The combined All
// flag (and any unrecognised value) has no name and yields a null string.
QString QgsStringStatisticalSummary::displayName( QgsStringStatisticalSummary::Statistic statistic )
{
  QString name;
  switch ( statistic )
  {
    case Count:
      name = QObject::tr( "Count" );
      break;
    case CountDistinct:
      name = QObject::tr( "Count (distinct)" );
      break;
    case CountMissing:
      name = QObject::tr( "Count (missing)" );
      break;
    case Min:
      name = QObject::tr( "Minimum" );
      break;
    case Max:
      name = QObject::tr( "Maximum" );
      break;
    case MinimumLength:
      name = QObject::tr( "Minimum length" );
      break;
    case MaximumLength:
      name = QObject::tr( "Maximum length" );
      break;
    case All:
      // not an individual statistic, leave the name null
      break;
  }
  return name;
}

View File

@ -0,0 +1,153 @@
/***************************************************************************
qgsstringstatisticalsummary.h
-----------------------------
Date : May 2016
Copyright : (C) 2016 by Nyall Dawson
Email : nyall dot dawson at gmail dot com
***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
***************************************************************************/
#ifndef QGSSTRINGSTATISTICALSUMMARY_H
#define QGSSTRINGSTATISTICALSUMMARY_H
#include <QSet>
#include <QVariantList>
/***************************************************************************
* This class is considered CRITICAL and any change MUST be accompanied with
* full unit tests in test_qgsstringstatisticalsummary.py.
* See details in QEP #17
****************************************************************************/
/** \ingroup core
* \class QgsStringStatisticalSummary
* \brief Calculator for summary statistics and aggregates for a list of strings.
*
* Statistics are calculated by calling @link calculate @endlink and passing a list of strings. The
* individual statistics can then be retrieved using the associated methods. Note that not all statistics
* are calculated by default. Statistics which require slower computations are only calculated by
* specifying the statistic in the constructor or via @link setStatistics @endlink.
*
* \note Added in version 2.16
*/
class CORE_EXPORT QgsStringStatisticalSummary
{
  public:

    //! Enumeration of flags that specify statistics to be calculated
    enum Statistic
    {
      Count = 1,  //!< Number of strings processed
      CountDistinct = 2,  //!< Number of distinct string values
      CountMissing = 4,  //!< Number of missing (null or empty) values
      Min = 8,  //!< Minimum string value
      Max = 16,  //!< Maximum string value
      MinimumLength = 32,  //!< Minimum length of string
      MaximumLength = 64,  //!< Maximum length of string
      All = Count | CountDistinct | CountMissing | Min | Max | MinimumLength | MaximumLength, //!< All statistics
    };
    Q_DECLARE_FLAGS( Statistics, Statistic )

    /** Constructor for QgsStringStatisticalSummary
     * @param stats flags for statistics to calculate
     */
    QgsStringStatisticalSummary( const QgsStringStatisticalSummary::Statistics& stats = All );

    /** Returns flags which specify which statistics will be calculated. Some statistics
     * are always calculated (eg count).
     * @see setStatistics
     */
    Statistics statistics() const { return mStatistics; }

    /** Sets flags which specify which statistics will be calculated. Some statistics
     * are always calculated (eg count). Changing the flags does not recalculate
     * or reset any already calculated values.
     * @param stats flags for statistics to calculate
     * @see statistics
     */
    void setStatistics( const Statistics& stats ) { mStatistics = stats; }

    /** Resets the calculated values
     */
    void reset();

    /** Calculates summary statistics for a list of strings. Any previously
     * calculated values are discarded first.
     * @param values list of strings
     */
    void calculate( const QStringList& values );

    /** Calculates summary statistics for a list of variants. Any non-string variants will be
     * ignored. Any previously calculated values are discarded first.
     * @param values list of variants
     */
    void calculate( const QVariantList& values );

    /** Returns the value of a specified statistic
     * @param stat statistic to return
     * @returns calculated value of statistic
     */
    QVariant statistic( Statistic stat ) const;

    /** Returns the calculated count of values.
     */
    int count() const { return mCount; }

    /** Returns the number of distinct string values.
     * @see distinctValues()
     */
    int countDistinct() const { return mValues.count(); }

    /** Returns the set of distinct string values.
     * @see countDistinct()
     */
    QSet< QString > distinctValues() const { return mValues; }

    /** Returns the number of missing (null or empty) string values.
     */
    int countMissing() const { return mCountMissing; }

    /** Returns the minimum (non-empty) string value.
     */
    QString min() const { return mMin; }

    /** Returns the maximum (non-empty) string value.
     */
    QString max() const { return mMax; }

    /** Returns the minimum length of strings.
     */
    int minLength() const { return mMinLength; }

    /** Returns the maximum length of strings.
     */
    int maxLength() const { return mMaxLength; }

    /** Returns the friendly display name for a statistic
     * @param statistic statistic to return name for
     */
    static QString displayName( Statistic statistic );

  private:

    //! Flags for the statistics requested by the caller
    Statistics mStatistics;

    //! Number of strings processed
    int mCount;
    //! Distinct string values encountered
    QSet< QString > mValues;
    //! Number of empty strings processed
    int mCountMissing;
    //! Smallest non-empty string encountered
    QString mMin;
    //! Largest non-empty string encountered
    QString mMax;
    //! Shortest string length encountered
    int mMinLength;
    //! Longest string length encountered
    int mMaxLength;

    //! Updates the running statistics with a single string
    void testString( const QString& string );
};
Q_DECLARE_OPERATORS_FOR_FLAGS( QgsStringStatisticalSummary::Statistics )
#endif // QGSSTRINGSTATISTICALSUMMARY_H

View File

@ -66,6 +66,7 @@ ADD_PYTHON_TEST(PyQgsShapefileProvider test_provider_shapefile.py)
ADD_PYTHON_TEST(PyQgsTabfileProvider test_provider_tabfile.py)
ADD_PYTHON_TEST(PyQgsSpatialIndex test_qgsspatialindex.py)
ADD_PYTHON_TEST(PyQgsSpatialiteProvider test_provider_spatialite.py)
ADD_PYTHON_TEST(PyQgsStringStatisticalSummary test_qgsstringstatisticalsummary.py)
ADD_PYTHON_TEST(PyQgsSymbolLayerV2 test_qgssymbollayerv2.py)
ADD_PYTHON_TEST(PyQgsArrowSymbolLayer test_qgsarrowsymbollayer.py)
ADD_PYTHON_TEST(PyQgsSymbolExpressionVariables test_qgssymbolexpressionvariables.py)

View File

@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
"""QGIS Unit tests for QgsStringStatisticalSummary.
.. note:: This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
"""
__author__ = 'Nyall Dawson'
__date__ = '07/05/2016'
__copyright__ = 'Copyright 2016, The QGIS Project'
# This will get replaced with a git SHA1 when you do a git archive
__revision__ = '$Format:%H$'
import qgis # NOQA
from qgis.core import (QgsStringStatisticalSummary
)
from qgis.testing import unittest
class PyQgsStringStatisticalSummary(unittest.TestCase):
    """Unit tests for QgsStringStatisticalSummary."""

    def testStats(self):
        """All statistics calculated together from a plain list of strings."""
        summary = QgsStringStatisticalSummary()
        self.assertEqual(summary.statistics(), QgsStringStatisticalSummary.All)

        strings = ['cc', 'aaaa', 'bbbbbbbb', 'aaaa', 'eeee', '', 'eeee', '', 'dddd']
        summary.calculate(strings)
        self.assertEqual(summary.count(), 9)
        self.assertEqual(summary.countDistinct(), 6)
        self.assertEqual(set(summary.distinctValues()),
                         {'cc', 'aaaa', 'bbbbbbbb', 'eeee', 'dddd', ''})
        self.assertEqual(summary.countMissing(), 2)
        self.assertEqual(summary.min(), 'aaaa')
        self.assertEqual(summary.max(), 'eeee')
        self.assertEqual(summary.minLength(), 0)
        self.assertEqual(summary.maxLength(), 8)

        # extra check for minLength without empty strings
        summary.calculate(['1111111', '111', '11111'])
        self.assertEqual(summary.minLength(), 3)

    def testIndividualStats(self):
        """Each statistic requested on its own, so that no calculation
        depends on another statistic being enabled."""
        cases = [
            (QgsStringStatisticalSummary.Count, 9),
            (QgsStringStatisticalSummary.CountDistinct, 6),
            (QgsStringStatisticalSummary.CountMissing, 2),
            (QgsStringStatisticalSummary.Min, 'aaaa'),
            (QgsStringStatisticalSummary.Max, 'eeee'),
            (QgsStringStatisticalSummary.MinimumLength, 0),
            (QgsStringStatisticalSummary.MaximumLength, 8),
        ]

        # a single instance is reused to exercise setStatistics()
        shared = QgsStringStatisticalSummary()
        for stat, expected in cases:
            # test constructor
            fresh = QgsStringStatisticalSummary(stat)
            self.assertEqual(fresh.statistics(), stat)

            shared.setStatistics(stat)
            self.assertEqual(shared.statistics(), stat)
            shared.calculate(['cc', 'aaaa', 'bbbbbbbb', 'aaaa', 'eeee', '', 'eeee', '', 'dddd'])
            self.assertEqual(shared.statistic(stat), expected)

            # display name
            self.assertGreater(len(QgsStringStatisticalSummary.displayName(stat)), 0)

    def testVariantStats(self):
        """Mixed-type input: only the string values are counted."""
        summary = QgsStringStatisticalSummary()
        self.assertEqual(summary.statistics(), QgsStringStatisticalSummary.All)

        summary.calculate(['cc', 5, 'bbbb', 'aaaa', 'eeee', 6, 9, '9', ''])
        self.assertEqual(summary.count(), 6)
        self.assertEqual(set(summary.distinctValues()),
                         {'cc', 'aaaa', 'bbbb', 'eeee', '', '9'})
        self.assertEqual(summary.countMissing(), 1)
        self.assertEqual(summary.min(), '9')
        self.assertEqual(summary.max(), 'eeee')


if __name__ == '__main__':
    unittest.main()