mirror of
https://github.com/commons-app/apps-android-commons.git
synced 2025-10-26 12:23:58 +01:00
Refactor string sorting utils to remove dependency (#2623)
Also adds Javadocs, and simplifies calculateSimilarity
This commit is contained in:
parent
701764d974
commit
cfade144d9
2 changed files with 62 additions and 21 deletions
|
|
@ -17,7 +17,6 @@ dependencies {
|
|||
implementation 'com.github.nicolas-raoul:Quadtree:ac16ea8035bf07'
|
||||
implementation 'commons-codec:commons-codec:1.10'
|
||||
implementation 'com.google.code.gson:gson:2.8.5'
|
||||
implementation 'info.debatty:java-string-similarity:0.24'
|
||||
implementation 'in.yuvi:http.fluent:1.3'
|
||||
implementation 'com.squareup.okhttp3:okhttp:3.12.1'
|
||||
implementation 'com.squareup.okio:okio:1.15.0'
|
||||
|
|
|
|||
|
|
@ -1,9 +1,6 @@
|
|||
package fr.free.nrw.commons.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.Locale;
|
||||
|
||||
import info.debatty.java.stringsimilarity.Levenshtein;
|
||||
|
||||
public class StringSortingUtils {
|
||||
|
||||
|
|
@ -12,14 +9,13 @@ public class StringSortingUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns Comparator for sorting strings by its similarity with Levenshtein
|
||||
* algorithm. By using this Comparator we get results from the highest to
|
||||
* the lowest match.
|
||||
* Returns Comparator for sorting strings by their similarity to the filter.
|
||||
* By using this Comparator we get results
|
||||
* from the highest to the lowest similarity with the filter.
|
||||
*
|
||||
* @param filter pattern to compare similarity
|
||||
* @param filter String to compare similarity with
|
||||
* @return Comparator with string similarity
|
||||
*/
|
||||
|
||||
public static Comparator<String> sortBySimilarity(final String filter) {
|
||||
return (firstItem, secondItem) -> {
|
||||
double firstItemSimilarity = calculateSimilarity(firstItem, filter);
|
||||
|
|
@ -28,20 +24,66 @@ public class StringSortingUtils {
|
|||
};
|
||||
}
|
||||
|
||||
private static double calculateSimilarity(String firstString, String secondString) {
|
||||
String longer = firstString.toLowerCase(Locale.getDefault());
|
||||
String shorter = secondString.toLowerCase(Locale.getDefault());
|
||||
|
||||
if (firstString.length() < secondString.length()) {
|
||||
longer = secondString;
|
||||
shorter = firstString;
|
||||
}
|
||||
int longerLength = longer.length();
|
||||
if (longerLength == 0) {
|
||||
return 1.0;
|
||||
}
|
||||
/**
|
||||
* Determines String similarity between str1 and str2 on scale from 0.0 to 1.0
* <p>
* The score is the length of the longer string minus the edit distance between
* the two strings, divided by the length of the longer string. Two empty
* strings are treated as a perfect match and yield 1.0.
*
|
||||
* @param str1 String 1
|
||||
* @param str2 String 2
|
||||
* @return Double between 0.0 and 1.0 that reflects string similarity
* (1.0 when both strings are empty)
|
||||
*/
|
||||
private static double calculateSimilarity(String str1, String str2) {
|
||||
// Length of the longer input; used as the normalisation denominator below
int longerLength = Math.max(str1.length(), str2.length());
|
||||
|
||||
// NOTE(review): this line uses `longer`/`shorter` and the Levenshtein library class,
// none of which are declared in this method as shown here; it looks like the
// pre-refactor statement superseded by the levenshteinDistance call below - confirm.
double distanceBetweenStrings = new Levenshtein().distance(longer, shorter);
|
||||
// Both strings empty: return early, which also avoids dividing by zero
if (longerLength == 0) return 1.0;
|
||||
|
||||
int distanceBetweenStrings = levenshteinDistance(str1, str2);
|
||||
// Cast to double so the division is floating-point rather than integer division
return (longerLength - distanceBetweenStrings) / (double) longerLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Levershtein distance algorithm
|
||||
* https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Java
|
||||
*
|
||||
* @param str1 String 1
|
||||
* @param str2 String 2
|
||||
* @return Number of characters the strings differ by
|
||||
*/
|
||||
private static int levenshteinDistance(String str1, String str2) {
|
||||
if (str1.equals(str2)) return 0;
|
||||
if (str1.length() == 0) return str2.length();
|
||||
if (str2.length() == 0) return str1.length();
|
||||
|
||||
int[] cost = new int[str1.length() + 1];
|
||||
int[] newcost = new int[str1.length() + 1];
|
||||
|
||||
// initial cost of skipping prefix in str1
|
||||
for (int i = 0; i < cost.length; i++) cost[i] = i;
|
||||
|
||||
// transformation cost for each letter in str2
|
||||
for (int j = 1; j <= str2.length(); j++) {
|
||||
// initial cost of skipping prefix in String str2
|
||||
newcost[0] = j;
|
||||
|
||||
// transformation cost for each letter in str1
|
||||
for(int i = 1; i < cost.length; i++) {
|
||||
// matching current letters in both strings
|
||||
int match = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1;
|
||||
|
||||
// computing cost for each transformation
|
||||
int cost_replace = cost[i - 1] + match;
|
||||
int cost_insert = cost[i] + 1;
|
||||
int cost_delete = newcost[i - 1] + 1;
|
||||
|
||||
// keep minimum cost
|
||||
newcost[i] = Math.min(Math.min(cost_insert, cost_delete), cost_replace);
|
||||
}
|
||||
|
||||
int[] tmp = cost;
|
||||
cost = newcost;
|
||||
newcost = tmp;
|
||||
}
|
||||
|
||||
// the distance is the cost for transforming all letters in both strings
|
||||
return cost[str1.length()];
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue