mirror of
https://github.com/commons-app/apps-android-commons.git
synced 2025-10-26 20:33:53 +01:00
enhance spammy category filter (#6167)
Signed-off-by: parneet-guraya <gurayaparneet@gmail.com>
This commit is contained in:
parent
e653857437
commit
7566ddf529
3 changed files with 61 additions and 24 deletions
0
app/.attach_pid781771
Normal file
0
app/.attach_pid781771
Normal file
|
|
@ -36,37 +36,35 @@ class CategoriesModel
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
fun isSpammyCategory(item: String): Boolean {
|
fun isSpammyCategory(item: String): Boolean {
|
||||||
// Check for current and previous year to exclude these categories from removal
|
|
||||||
val now = Calendar.getInstance()
|
|
||||||
val curYear = now[Calendar.YEAR]
|
|
||||||
val curYearInString = curYear.toString()
|
|
||||||
val prevYear = curYear - 1
|
|
||||||
val prevYearInString = prevYear.toString()
|
|
||||||
Timber.d("Previous year: %s", prevYearInString)
|
|
||||||
|
|
||||||
val mentionsDecade = item.matches(".*0s.*".toRegex())
|
|
||||||
val recentDecade = item.matches(".*20[0-2]0s.*".toRegex())
|
|
||||||
val spammyCategory =
|
|
||||||
item.matches("(.*)needing(.*)".toRegex()) ||
|
|
||||||
item.matches("(.*)taken on(.*)".toRegex())
|
|
||||||
|
|
||||||
// always skip irrelevant categories such as Media_needing_categories_as_of_16_June_2017(Issue #750)
|
// always skip irrelevant categories such as Media_needing_categories_as_of_16_June_2017(Issue #750)
|
||||||
|
val spammyCategory = item.matches("(.*)needing(.*)".toRegex())
|
||||||
|
|| item.matches("(.*)taken on(.*)".toRegex())
|
||||||
|
|
||||||
|
// checks for
|
||||||
|
// dd/mm/yyyy or yy
|
||||||
|
// yyyy or yy/mm/dd
|
||||||
|
// yyyy or yy/mm
|
||||||
|
// mm/yyyy or yy
|
||||||
|
// for `yy` it is assumed that 20XX is implicit.
|
||||||
|
// with separators [., /, -]
|
||||||
|
val isIrrelevantCategory =
|
||||||
|
item.contains("""\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|\d{2,4}[-/.]\d{1,2}[-/.]\d{1,2}|\d{2,4}[-/.]\d{1,2}|\d{1,2}[-/.]\d{2,4}""".toRegex())
|
||||||
|
|
||||||
|
|
||||||
if (spammyCategory) {
|
if (spammyCategory) {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mentionsDecade) {
|
if(isIrrelevantCategory){
|
||||||
// Check if the year in the form of XX(X)0s is recent/relevant, i.e. in the 2000s or 2010s/2020s as stated in Issue #1029
|
return true
|
||||||
// Example: "2020s" is OK, but "1920s" is not (and should be skipped)
|
|
||||||
return !recentDecade
|
|
||||||
} else {
|
|
||||||
// If it is not a year in decade form (e.g. 19xxs/20xxs), then check if item contains a 4-digit year
|
|
||||||
// anywhere within the string (.* is wildcard) (Issue #47)
|
|
||||||
// And that item does not equal the current year or previous year
|
|
||||||
return item.matches(".*(19|20)\\d{2}.*".toRegex()) &&
|
|
||||||
!item.contains(curYearInString) &&
|
|
||||||
!item.contains(prevYearInString)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
val hasYear = item.matches("(.*\\d{4}.*)".toRegex())
|
||||||
|
val validYearsRange = item.matches(".*(20[0-9]{2}).*".toRegex())
|
||||||
|
|
||||||
|
// Finally, an item containing a 4-digit number is treated as spammy unless it also contains a year in the 20XX range.
|
||||||
|
return hasYear && !validYearsRange
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ import fr.free.nrw.commons.upload.GpsCategoryModel
|
||||||
import io.reactivex.Single
|
import io.reactivex.Single
|
||||||
import io.reactivex.subjects.BehaviorSubject
|
import io.reactivex.subjects.BehaviorSubject
|
||||||
import media
|
import media
|
||||||
|
import org.junit.Assert
|
||||||
import org.junit.Before
|
import org.junit.Before
|
||||||
import org.junit.Test
|
import org.junit.Test
|
||||||
import org.mockito.ArgumentMatchers
|
import org.mockito.ArgumentMatchers
|
||||||
|
|
@ -331,4 +332,42 @@ class CategoriesModelTest {
|
||||||
media(),
|
media(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test valid input with XXXX in it between the expected range 20XX`() {
|
||||||
|
val input = categoriesModel.isSpammyCategory("Amavenita (ship, 2014)")
|
||||||
|
Assert.assertFalse(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test valid input with XXXXs in it between the expected range 20XXs`() {
|
||||||
|
val input = categoriesModel.isSpammyCategory("Amavenita (ship, 2014s)")
|
||||||
|
Assert.assertFalse(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test invalid category when have needing in the input`() {
|
||||||
|
val input = categoriesModel.isSpammyCategory("Media needing categories as of 30 March 2017")
|
||||||
|
Assert.assertTrue(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test invalid category when have taken on in the input`() {
|
||||||
|
val input = categoriesModel.isSpammyCategory("Photographs taken on 2015-12-08")
|
||||||
|
Assert.assertTrue(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test invalid category when have yy mm or yy mm dd in the input`() {
|
||||||
|
// filtering based on [., /, -] separators between the dates.
|
||||||
|
val input = categoriesModel.isSpammyCategory("Image class 09.14")
|
||||||
|
Assert.assertTrue(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test invalid category when have years not in 20XX range`() {
|
||||||
|
val input = categoriesModel.isSpammyCategory("Japan in the 1400s")
|
||||||
|
Assert.assertTrue(input)
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue