[processing] Fix inefficiencies in Delete Duplicate Geometries algorithm

...and make the progress bar more accurate.

Use a spatial index to avoid comparing every feature to every other
feature, and only compare against features with intersecting bounding
boxes instead. Also optimise feature requests and loop logic.

Benchmarks:

Point layer, 6k features

Before: 30 seconds
After: 0.15 seconds

Point layer, 45k features

Before: > 10 minutes
After: 7 seconds

Fixes #19973
This commit is contained in:
Nyall Dawson 2018-09-28 13:44:27 +10:00
parent 6110931f8a
commit 9698444f4a

View File

@ -70,6 +70,7 @@ class DeleteDuplicateGeometries(QgisAlgorithm):
raise QgsProcessingException(self.invalidSinkError(parameters, self.OUTPUT)) raise QgsProcessingException(self.invalidSinkError(parameters, self.OUTPUT))
features = source.getFeatures(QgsFeatureRequest().setSubsetOfAttributes([])) features = source.getFeatures(QgsFeatureRequest().setSubsetOfAttributes([]))
total = 100.0 / source.featureCount() if source.featureCount() else 0 total = 100.0 / source.featureCount() if source.featureCount() else 0
geoms = dict() geoms = dict()
index = QgsSpatialIndex() index = QgsSpatialIndex()
@ -78,28 +79,52 @@ class DeleteDuplicateGeometries(QgisAlgorithm):
break break
geoms[f.id()] = f.geometry() geoms[f.id()] = f.geometry()
#index.insertFeature index.addFeature(f)
feedback.setProgress(int(current * total))
cleaned = dict(geoms) feedback.setProgress(int(0.10 * current * total)) # takes about 10% of time
for i, g in list(geoms.items()): # start by assuming everything is unique, and chop away at this list
unique_features = dict(geoms)
current = 0
for feature_id, geometry in geoms.items():
if feedback.isCanceled(): if feedback.isCanceled():
break break
for j in list(cleaned.keys()): if feature_id not in unique_features:
if i == j or i not in cleaned: # feature was already marked as a duplicate
continue continue
if g.isGeosEqual(cleaned[j]):
del cleaned[j]
total = 100.0 / len(cleaned) if cleaned else 1 candidates = index.intersects(geometry.boundingBox())
request = QgsFeatureRequest().setFilterFids(list(cleaned.keys())) candidates.remove(feature_id)
for candidate_id in candidates:
if candidate_id not in unique_features:
# candidate already marked as a duplicate (not sure if this is possible,
# since it would mean the current feature would also have to be a duplicate!
# but let's be safe!)
continue
if geometry.isGeosEqual(geoms[candidate_id]):
# candidate is a duplicate of feature
del unique_features[candidate_id]
current += 1
feedback.setProgress(int(0.80 * current * total) + 10) # takes about 80% of time
total = 100.0 / len(unique_features) if unique_features else 1
# now, fetch all the feature attributes for the unique features only
# be super-smart and don't re-fetch geometries
request = QgsFeatureRequest().setFilterFids(list(unique_features.keys())).setFlags(QgsFeatureRequest.NoGeometry)
for current, f in enumerate(source.getFeatures(request)): for current, f in enumerate(source.getFeatures(request)):
if feedback.isCanceled(): if feedback.isCanceled():
break break
# use already fetched geometry
f.setGeometry(unique_features[f.id()])
sink.addFeature(f, QgsFeatureSink.FastInsert) sink.addFeature(f, QgsFeatureSink.FastInsert)
feedback.setProgress(int(current * total))
feedback.setProgress(int(0.10 * current * total) + 90) # takes about 10% of time
return {self.OUTPUT: dest_id} return {self.OUTPUT: dest_id}