From cfade144d9b03c0760556e727d69ee537a0b3b1c Mon Sep 17 00:00:00 2001 From: Adam Jones Date: Sun, 17 Mar 2019 05:51:45 +0000 Subject: [PATCH] Refactor string sorting utils to remove dependency (#2623) Also adds Javadocs, and simplifies calculateSimilarity --- app/build.gradle | 1 - .../nrw/commons/utils/StringSortingUtils.java | 82 ++++++++++++++----- 2 files changed, 62 insertions(+), 21 deletions(-) diff --git a/app/build.gradle b/app/build.gradle index edb98dbb3..c032e5520 100644 --- a/app/build.gradle +++ b/app/build.gradle @@ -17,7 +17,6 @@ dependencies { implementation 'com.github.nicolas-raoul:Quadtree:ac16ea8035bf07' implementation 'commons-codec:commons-codec:1.10' implementation 'com.google.code.gson:gson:2.8.5' - implementation 'info.debatty:java-string-similarity:0.24' implementation 'in.yuvi:http.fluent:1.3' implementation 'com.squareup.okhttp3:okhttp:3.12.1' implementation 'com.squareup.okio:okio:1.15.0' diff --git a/app/src/main/java/fr/free/nrw/commons/utils/StringSortingUtils.java b/app/src/main/java/fr/free/nrw/commons/utils/StringSortingUtils.java index 1b321ec07..cbfb93994 100644 --- a/app/src/main/java/fr/free/nrw/commons/utils/StringSortingUtils.java +++ b/app/src/main/java/fr/free/nrw/commons/utils/StringSortingUtils.java @@ -1,9 +1,6 @@ package fr.free.nrw.commons.utils; import java.util.Comparator; -import java.util.Locale; - -import info.debatty.java.stringsimilarity.Levenshtein; public class StringSortingUtils { @@ -12,14 +9,13 @@ public class StringSortingUtils { } /** - * Returns Comparator for sorting strings by its similarity with Levenshtein - * algorithm. By using this Comparator we get results from the highest to - * the lowest match. + * Returns Comparator for sorting strings by their similarity to the filter. + * By using this Comparator we get results + * from the highest to the lowest similarity with the filter. 
* - * @param filter pattern to compare similarity + * @param filter String to compare similarity with * @return Comparator with string similarity */ - public static Comparator sortBySimilarity(final String filter) { return (firstItem, secondItem) -> { double firstItemSimilarity = calculateSimilarity(firstItem, filter); @@ -28,20 +24,66 @@ public class StringSortingUtils { }; } - private static double calculateSimilarity(String firstString, String secondString) { - String longer = firstString.toLowerCase(Locale.getDefault()); - String shorter = secondString.toLowerCase(Locale.getDefault()); - if (firstString.length() < secondString.length()) { - longer = secondString; - shorter = firstString; - } - int longerLength = longer.length(); - if (longerLength == 0) { - return 1.0; - } + /** + * Determines String similarity between str1 and str2 on scale from 0.0 to 1.0 + * @param str1 String 1 + * @param str2 String 2 + * @return Double between 0.0 and 1.0 that reflects string similarity + */ + private static double calculateSimilarity(String str1, String str2) { + int longerLength = Math.max(str1.length(), str2.length()); - double distanceBetweenStrings = new Levenshtein().distance(longer, shorter); + if (longerLength == 0) return 1.0; + + int distanceBetweenStrings = levenshteinDistance(str1, str2); return (longerLength - distanceBetweenStrings) / (double) longerLength; } + + /** + * Levenshtein distance algorithm + * https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Java + * + * @param str1 String 1 + * @param str2 String 2 + * @return Minimum number of single-character edits needed to turn one string into the other + */ + private static int levenshteinDistance(String str1, String str2) { + if (str1.equals(str2)) return 0; + if (str1.length() == 0) return str2.length(); + if (str2.length() == 0) return str1.length(); + + int[] cost = new int[str1.length() + 1]; + int[] newcost = new int[str1.length() + 1]; + + // initial cost of skipping prefix in str1 + for (int i = 0; i < 
cost.length; i++) cost[i] = i; + + // transformation cost for each letter in str2 + for (int j = 1; j <= str2.length(); j++) { + // initial cost of skipping prefix in String str2 + newcost[0] = j; + + // transformation cost for each letter in str1 + for(int i = 1; i < cost.length; i++) { + // matching current letters in both strings + int match = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1; + + // computing cost for each transformation + int cost_replace = cost[i - 1] + match; + int cost_insert = cost[i] + 1; + int cost_delete = newcost[i - 1] + 1; + + // keep minimum cost + newcost[i] = Math.min(Math.min(cost_insert, cost_delete), cost_replace); + } + + int[] tmp = cost; + cost = newcost; + newcost = tmp; + } + + // the distance is the cost for transforming all letters in both strings + return cost[str1.length()]; + } }