enhance spammy category filter (#6167)

Signed-off-by: parneet-guraya <gurayaparneet@gmail.com>
This commit is contained in:
Parneet Singh 2025-02-01 05:43:17 +05:30 committed by GitHub
parent e653857437
commit 7566ddf529
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 61 additions and 24 deletions

0
app/.attach_pid781771 Normal file
View file

View file

@ -36,37 +36,35 @@ class CategoriesModel
* @return
*/
fun isSpammyCategory(item: String): Boolean {
    // Decides whether a category name should be hidden from the suggestions.
    // `item` is the category title; the result is true when the category must be skipped.

    // Maintenance and per-day categories are never useful suggestions, e.g.
    // "Media needing categories as of 16 June 2017" (Issue #750) or
    // "Photographs taken on 2015-12-08".
    if (item.contains("needing") || item.contains("taken on")) {
        return true
    }

    // Anything that carries a numeric date (see spammyDatePattern) is too specific to be reused.
    if (spammyDatePattern.containsMatchIn(item)) {
        return true
    }

    // A name containing a four-digit number is kept only when a 20XX year
    // appears somewhere in it: "Japan in the 1400s" is rejected, whereas
    // "Amavenita (ship, 2014)" and "Amavenita (ship, 2014s)" are accepted.
    // Names without any four-digit number are always accepted at this point.
    return fourDigitNumberPattern.containsMatchIn(item) &&
        !recentYearPattern.containsMatchIn(item)
}

// Numeric dates using '-', '/' or '.' as separator, in any of these layouts
// (two-digit years are accepted wherever a year is expected):
//   dd/mm/yyyy, yyyy/mm/dd, yyyy/mm, mm/yyyy
// Compiled once instead of on every isSpammyCategory call.
private val spammyDatePattern =
    Regex("""\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|\d{2,4}[-/.]\d{1,2}[-/.]\d{1,2}|\d{2,4}[-/.]\d{1,2}|\d{1,2}[-/.]\d{2,4}""")

// Any run of four digits, treated by isSpammyCategory as a candidate year.
private val fourDigitNumberPattern = Regex("""\d{4}""")

// A year in the 2000-2099 range.
private val recentYearPattern = Regex("""20[0-9]{2}""")
/**

View file

@ -11,6 +11,7 @@ import fr.free.nrw.commons.upload.GpsCategoryModel
import io.reactivex.Single
import io.reactivex.subjects.BehaviorSubject
import media
import org.junit.Assert
import org.junit.Before
import org.junit.Test
import org.mockito.ArgumentMatchers
@ -331,4 +332,42 @@ class CategoriesModelTest {
media(),
)
}
@Test
fun `test valid input with XXXX in it between the expected range 20XX`() {
    // A category whose only year is in the 20XX range must not be filtered out.
    val category = "Amavenita (ship, 2014)"
    Assert.assertFalse(categoriesModel.isSpammyCategory(category))
}
@Test
fun `test valid input with XXXXs in it between the expected range 20XXs`() {
    // The trailing "s" after a 20XX year must not change the verdict.
    val category = "Amavenita (ship, 2014s)"
    Assert.assertFalse(categoriesModel.isSpammyCategory(category))
}
@Test
fun `test invalid category when have needing in the input`() {
    // Maintenance categories containing "needing" are expected to be rejected.
    val category = "Media needing categories as of 30 March 2017"
    Assert.assertTrue(categoriesModel.isSpammyCategory(category))
}
@Test
fun `test invalid category when have taken on in the input`() {
    // Per-day categories containing "taken on" are expected to be rejected.
    val category = "Photographs taken on 2015-12-08"
    Assert.assertTrue(categoriesModel.isSpammyCategory(category))
}
@Test
fun `test invalid category when have yy mm or yy mm dd in the input`() {
    // Date-like fragments written with one of the separators '.', '/' or '-'
    // are expected to get the category rejected.
    val category = "Image class 09.14"
    Assert.assertTrue(categoriesModel.isSpammyCategory(category))
}
@Test
fun `test invalid category when have years not in 20XX range`() {
    // A four-digit year outside 20XX is expected to get the category rejected.
    val category = "Japan in the 1400s"
    Assert.assertTrue(categoriesModel.isSpammyCategory(category))
}
}