Don't allow matches from shows only matching 'the'

master
Nick Krichevsky 2021-08-25 23:36:15 -04:00
parent 2f4695510a
commit 07440d72d0
5 changed files with 128 additions and 8 deletions

View File

@ -2,6 +2,8 @@ package match
import (
"errors"
"strings"
"unicode"
"github.com/agnivade/levenshtein"
)
@ -17,10 +19,12 @@ type match struct {
func FindBestMatch(candidate string, index []string) (string, error) {
haveCommonPrefix := []string{}
for _, indexEntry := range index {
prefix := getLongestCommonPrefix(candidate, indexEntry)
// Single character prefixes aren't that helpful.
// I don't want to get into threshold territory by playing with a number higher than this, though
if len(prefix) > 1 {
prefix := getLongestCommonPrefix(
normalizeForMatch(candidate),
normalizeForMatch(indexEntry),
)
if isPrefixMeaningful(prefix) {
haveCommonPrefix = append(haveCommonPrefix, indexEntry)
}
}
@ -61,3 +65,24 @@ func findMinScoreMatch(matches []match) match {
return minScorematch
}
func normalizeForMatch(target string) string {
return filterChars(target, func(r rune) bool {
return !isCharLikelySeparator(r)
})
}
func filterChars(target string, shouldAllowChar func(s rune) bool) string {
builder := strings.Builder{}
for _, char := range target {
if shouldAllowChar(char) {
builder.WriteRune(char)
}
}
return builder.String()
}
func isCharLikelySeparator(r rune) bool {
return !unicode.IsNumber(r) && !unicode.IsLetter(r)
}

View File

@ -78,6 +78,13 @@ func TestFindBestMatch(t *testing.T) {
expectedMatch: "",
expectedError: ErrNoMatches,
},
{
// two have a common prefix of "the-a" but really there should be no match
candidate: "the-amazing-gymnist",
index: []string{"the-athlete", "sitting-at-home", "the-show-about-nothing"},
expectedMatch: "",
expectedError: ErrNoMatches,
},
}
for _, test := range tt {
@ -89,7 +96,7 @@ func TestFindBestMatch(t *testing.T) {
t.Run(testName, func(t *testing.T) {
match, err := FindBestMatch(test.candidate, test.index)
if test.expectedError != nil {
assert.ErrorIs(t, err, ErrNoMatches, fmt.Sprintf("got %s", match))
assert.ErrorIs(t, err, ErrNoMatches, fmt.Sprintf("got match %q and error %q", match, err))
} else {
assert.Nil(t, err)
assert.Equal(t, test.expectedMatch, match)

View File

@ -1,5 +1,9 @@
package match
import (
"strings"
)
func getLongestCommonPrefix(a, b string) string {
shortestString, longestString := getShortestAndLongestString(a, b)
for i := range shortestString {
@ -19,3 +23,38 @@ func getShortestAndLongestString(a, b string) (string, string) {
return a, b
}
}
// isPrefixMeaningful checks if the given of a match is truly meaningful enough to allow it to be considered a
// good enough match to continue processing. For instance, a longest common prefix of a single letter is not really
// all that meaningful.
func isPrefixMeaningful(prefix string) bool {
// Single character prefixes aren't that helpful.
// I don't want to get into threshold territory by playing with a number higher than this, though
if len(prefix) <= 1 {
return false
}
// A match whose prefix is "the" (possibly followed by a separator) is probably not that useful of a match
return isPrefixMeaningfulWithoutThe(prefix)
}
// Check if the start of a given prefix is 'the', and if it is, whether removing it would still procure a
// meaningful match
func isPrefixMeaningfulWithoutThe(prefix string) bool {
if len(prefix) < 3 {
return false
}
startsWithThe := strings.ToLower(prefix[:3]) == "the"
if !startsWithThe {
return true
}
trimmedPrefix := strings.TrimFunc(prefix, isCharLikelySeparator)
if len(trimmedPrefix) <= 3 {
return false
}
prefixWithoutThe := strings.TrimFunc(trimmedPrefix[3:], isCharLikelySeparator)
return isPrefixMeaningful(prefixWithoutThe)
}

View File

@ -39,3 +39,40 @@ func TestLongestCommonPrefix(t *testing.T) {
assert.Equal(t, test.expectedPrefix, prefix)
}
}
func TestIsPrefixMeaningful(t *testing.T) {
tt := []struct {
prefix string
expected bool
}{
{
prefix: "Brooklyn",
expected: true,
},
{
prefix: "b",
expected: false,
},
{
prefix: "The",
expected: false,
},
{
prefix: "The-",
expected: false,
},
{
prefix: "The Runner",
expected: true,
},
{
prefix: "The R",
expected: false,
},
}
for _, test := range tt {
isMeaningful := isPrefixMeaningful(test.prefix)
assert.Equal(t, test.expected, isMeaningful)
}
}

View File

@ -46,15 +46,27 @@ func TestFindBestFolder(t *testing.T) {
"",
match.ErrNoMatches,
},
{
"A match with a non-meaningful prefix ('the') should not match",
"The.Amazing.Spider-Man.1977.S01.VHSRip",
[]string{"The-Good-Place", "The-Simpsons", "Stranger-Things", "Brooklyn-Nine-Nine", "The-IT-Crowd"},
"",
match.ErrNoMatches,
},
{
"A good match starting with 'the' should still match",
"The.Simpsons.S06.720p.DSNP.WEB-DL.DDP5.1.H.264-MZABI",
[]string{"The-Good-Place", "The-Simpsons", "Stranger-Things", "Brooklyn-Nine-Nine", "The-IT-Crowd"},
"The-Simpsons",
nil,
},
}
for _, test := range table {
t.Run(test.testName, func(t *testing.T) {
foundFolder, err := findShowFolder(test.fileName, test.folders)
if test.expectedError != nil {
assert.ErrorIs(t, err, test.expectedError, fmt.Sprintf("got %s", foundFolder))
assert.ErrorIs(t, err, test.expectedError, fmt.Sprintf("got show %q and error %q", foundFolder, err))
} else {
assert.Nil(t, err)
assert.Equal(t, test.expectedFolder, foundFolder)