[processing] Fix inefficiencies in Delete Duplicate Geometries algorithm

..and make progress bar more accurate. Use a spatial index to avoid comparing every feature to every other feature, and only compare against features with intersecting bounding boxes instead. Also optimise feature requests and loop logic. Benchmarks: Point layer, 6000k features Before: 30 seconds After: 0.15 seconds Point layer, 45k features Before: > 10 minutes After: 7 seconds Fixes #19973
2025-07-01 00:02:20 -04:00 · 2018-09-28 13:44:27 +10:00 · 2018-09-28 13:44:27 +10:00 · 9698444f4a
commit 9698444f4a
parent 6110931f8a
1 changed files with 37 additions and 12 deletions
--- a/python/plugins/processing/algs/qgis/DeleteDuplicateGeometries.py
+++ b/python/plugins/processing/algs/qgis/DeleteDuplicateGeometries.py
@ -70,6 +70,7 @@ class DeleteDuplicateGeometries(QgisAlgorithm):
            raise QgsProcessingException(self.invalidSinkError(parameters, self.OUTPUT))
        features = source.getFeatures(QgsFeatureRequest().setSubsetOfAttributes([]))
        total = 100.0 / source.featureCount() if source.featureCount() else 0
        geoms = dict()
        index = QgsSpatialIndex()
@ -78,28 +79,52 @@ class DeleteDuplicateGeometries(QgisAlgorithm):
                break
            geoms[f.id()] = f.geometry()
-            #index.insertFeature
+            index.addFeature(f)
            feedback.setProgress(int(current * total))
-        cleaned = dict(geoms)
+            feedback.setProgress(int(0.10 * current * total)) # takes about 10% of time
-        for i, g in list(geoms.items()):
+        # start by assuming everything is unique, and chop away at this list
        unique_features = dict(geoms)
        current = 0
        for feature_id, geometry in geoms.items():
            if feedback.isCanceled():
                break
-            for j in list(cleaned.keys()):
+            if feature_id not in unique_features:
-                if i == j or i not in cleaned:
+                # feature was already marked as a duplicate
-                    continue
+                continue
                if g.isGeosEqual(cleaned[j]):
                    del cleaned[j]
-        total = 100.0 / len(cleaned) if cleaned else 1
+            candidates = index.intersects(geometry.boundingBox())
-        request = QgsFeatureRequest().setFilterFids(list(cleaned.keys()))
+            candidates.remove(feature_id)
            for candidate_id in candidates:
                if candidate_id not in unique_features:
                    # candidate already marked as a duplicate (not sure if this is possible,
                    # since it would mean the current feature would also have to be a duplicate!
                    # but let's be safe!)
                    continue
                if geometry.isGeosEqual(geoms[candidate_id]):
                    # candidate is a duplicate of feature
                    del unique_features[candidate_id]
            current += 1
            feedback.setProgress(int(0.80 * current * total) + 10)  # takes about 80% of time
        total = 100.0 / len(unique_features) if unique_features else 1
        # now, fetch all the feature attributes for the unique features only
        # be super-smart and don't re-fetch geometries
        request = QgsFeatureRequest().setFilterFids(list(unique_features.keys())).setFlags(QgsFeatureRequest.NoGeometry)
        for current, f in enumerate(source.getFeatures(request)):
            if feedback.isCanceled():
                break
            # use already fetched geometry
            f.setGeometry(unique_features[f.id()])
            sink.addFeature(f, QgsFeatureSink.FastInsert)
-            feedback.setProgress(int(current * total))
+
            feedback.setProgress(int(0.10 * current * total) + 90) # takes about 10% of time
        return {self.OUTPUT: dest_id}