Refactor string sorting utils to remove dependency (#2623)

Also adds Javadocs, and simplifies calculateSimilarity
2025-10-26 12:23:58 +01:00 · 2019-03-17 05:51:45 +00:00 · 2019-03-17 05:51:45 +00:00 · cfade144d9
commit cfade144d9
parent 701764d974
2 changed files with 62 additions and 21 deletions
--- a/app/build.gradle
+++ b/app/build.gradle
@ -17,7 +17,6 @@ dependencies {
    implementation 'com.github.nicolas-raoul:Quadtree:ac16ea8035bf07'
    implementation 'commons-codec:commons-codec:1.10'
    implementation 'com.google.code.gson:gson:2.8.5'
-    implementation 'info.debatty:java-string-similarity:0.24'
    implementation 'in.yuvi:http.fluent:1.3'
    implementation 'com.squareup.okhttp3:okhttp:3.12.1'
    implementation 'com.squareup.okio:okio:1.15.0'
--- a/app/src/main/java/fr/free/nrw/commons/utils/StringSortingUtils.java
+++ b/app/src/main/java/fr/free/nrw/commons/utils/StringSortingUtils.java
@ -1,9 +1,6 @@
 package fr.free.nrw.commons.utils;

 import java.util.Comparator;
-import java.util.Locale;
-
-import info.debatty.java.stringsimilarity.Levenshtein;

 public class StringSortingUtils {

@ -12,14 +9,13 @@ public class StringSortingUtils {
    }

    /**
-     * Returns Comparator for sorting strings by its similarity with Levenshtein
-     * algorithm. By using this Comparator we get results from the highest to
-     * the lowest match.
+     * Returns Comparator for sorting strings by their similarity to the filter.
+     * By using this Comparator we get results
+     * from the highest to the lowest similarity with the filter.
     *
-     * @param filter pattern to compare similarity
+     * @param filter String to compare similarity with
     * @return Comparator with string similarity
     */
-
    public static Comparator<String> sortBySimilarity(final String filter) {
        return (firstItem, secondItem) -> {
            double firstItemSimilarity = calculateSimilarity(firstItem, filter);
@ -28,20 +24,66 @@ public class StringSortingUtils {
        };
    }

-    private static double calculateSimilarity(String firstString, String secondString) {
-        String longer = firstString.toLowerCase(Locale.getDefault());
-        String shorter = secondString.toLowerCase(Locale.getDefault());

-        if (firstString.length() < secondString.length()) {
-            longer = secondString;
-            shorter = firstString;
-        }
-        int longerLength = longer.length();
-        if (longerLength == 0) {
-            return 1.0;
-        }
+    /**
+     * Determines case-sensitive similarity of str1 and str2 as 1 - (Levenshtein distance / longer length)
+     * @param str1 String 1 (must not be null)
+     * @param str2 String 2 (must not be null)
+     * @return Double between 0.0 and 1.0 that reflects string similarity; 1.0 when both strings are empty
+     */
+    private static double calculateSimilarity(String str1, String str2) {
+        int longerLength = Math.max(str1.length(), str2.length());

-        double distanceBetweenStrings = new Levenshtein().distance(longer, shorter);
+        if (longerLength == 0) return 1.0; // both strings empty: identical, and avoids dividing by zero
+
+        int distanceBetweenStrings = levenshteinDistance(str1, str2);
        return (longerLength - distanceBetweenStrings) / (double) longerLength;
    }
+
+    /**
+     * Levenshtein distance algorithm (case-sensitive), computed with two rolling cost rows
+     * https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Java
+     *
+     * @param str1 String 1 (must not be null)
+     * @param str2 String 2 (must not be null)
+     * @return Minimum number of single-character insertions, deletions or substitutions to turn one string into the other
+     */
+    private static int levenshteinDistance(String str1, String str2) {
+        if (str1.equals(str2)) return 0;
+        if (str1.length() == 0) return str2.length();
+        if (str2.length() == 0) return str1.length();
+
+        int[] cost = new int[str1.length() + 1];
+        int[] newcost = new int[str1.length() + 1];
+
+        // first row: distance between the first i chars of str1 and the empty prefix of str2 is i
+        for (int i = 0; i < cost.length; i++) cost[i] = i;
+
+        // transformation cost for each letter in str2
+        for (int j = 1; j <= str2.length(); j++) {
+            // distance between the empty prefix of str1 and the first j chars of str2 is j
+            newcost[0] = j;
+
+            // transformation cost for each letter in str1
+            for(int i = 1; i < cost.length; i++) {
+                // 0 if the current letters are equal, 1 if a substitution is needed
+                int match = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1;
+
+                // computing cost for each transformation
+                int cost_replace = cost[i - 1] + match;
+                int cost_insert  = cost[i] + 1;
+                int cost_delete  = newcost[i - 1] + 1;
+
+                // keep minimum cost
+                newcost[i] = Math.min(Math.min(cost_insert, cost_delete), cost_replace);
+            }
+
+            int[] tmp = cost; // swap rows: the row just filled becomes the previous row for the next j
+            cost = newcost;
+            newcost = tmp;
+        }
+
+        // after the final swap, cost holds the last computed row; its last cell is the full distance
+        return cost[str1.length()];
+    }
 }