diff --git a/categorize/match/match.go b/categorize/match/match.go index c2359bf..8fb77ee 100644 --- a/categorize/match/match.go +++ b/categorize/match/match.go @@ -2,6 +2,8 @@ package match import ( "errors" + "strings" + "unicode" "github.com/agnivade/levenshtein" ) @@ -17,10 +19,12 @@ type match struct { func FindBestMatch(candidate string, index []string) (string, error) { haveCommonPrefix := []string{} for _, indexEntry := range index { - prefix := getLongestCommonPrefix(candidate, indexEntry) - // Single character prefixes aren't that helpful. - // I don't want to get into threshold territory by playing with a number higher than this, though - if len(prefix) > 1 { + prefix := getLongestCommonPrefix( + normalizeForMatch(candidate), + normalizeForMatch(indexEntry), + ) + + if isPrefixMeaningful(prefix) { haveCommonPrefix = append(haveCommonPrefix, indexEntry) } } @@ -61,3 +65,24 @@ func findMinScoreMatch(matches []match) match { return minScorematch } + +func normalizeForMatch(target string) string { + return filterChars(target, func(r rune) bool { + return !isCharLikelySeparator(r) + }) +} + +func filterChars(target string, shouldAllowChar func(s rune) bool) string { + builder := strings.Builder{} + for _, char := range target { + if shouldAllowChar(char) { + builder.WriteRune(char) + } + } + + return builder.String() +} + +func isCharLikelySeparator(r rune) bool { + return !unicode.IsNumber(r) && !unicode.IsLetter(r) +} diff --git a/categorize/match/match_test.go b/categorize/match/match_test.go index 07ad4f6..f638dbf 100644 --- a/categorize/match/match_test.go +++ b/categorize/match/match_test.go @@ -78,6 +78,13 @@ func TestFindBestMatch(t *testing.T) { expectedMatch: "", expectedError: ErrNoMatches, }, + { + // two have a common prefix of "the-a" but really there should be no match + candidate: "the-amazing-gymnist", + index: []string{"the-athlete", "sitting-at-home", "the-show-about-nothing"}, + expectedMatch: "", + expectedError: ErrNoMatches, + }, } for _, test := range tt { @@ -89,7 +96,7 @@ func TestFindBestMatch(t *testing.T) { t.Run(testName, func(t *testing.T) { match, err := FindBestMatch(test.candidate, test.index) if test.expectedError != nil { - assert.ErrorIs(t, err, ErrNoMatches, fmt.Sprintf("got %s", match)) + assert.ErrorIs(t, err, ErrNoMatches, fmt.Sprintf("got match %q and error %q", match, err)) } else { assert.Nil(t, err) assert.Equal(t, test.expectedMatch, match) diff --git a/categorize/match/prefix.go b/categorize/match/prefix.go index 3fdef60..76c09fe 100644 --- a/categorize/match/prefix.go +++ b/categorize/match/prefix.go @@ -1,5 +1,9 @@ package match +import ( + "strings" +) + func getLongestCommonPrefix(a, b string) string { shortestString, longestString := getShortestAndLongestString(a, b) for i := range shortestString { @@ -19,3 +23,38 @@ func getShortestAndLongestString(a, b string) (string, string) { return a, b } } + +// isPrefixMeaningful checks if the given of a match is truly meaningful enough to allow it to be considered a +// good enough match to continue processing. For instance, a longest common prefix of a single letter is not really +// all that meaningful. +func isPrefixMeaningful(prefix string) bool { + // Single character prefixes aren't that helpful. + // I don't want to get into threshold territory by playing with a number higher than this, though + if len(prefix) <= 1 { + return false + } + + // A match whose prefix is "the" (possibly followed by a separator) is probably not that useful of a match + return isPrefixMeaningfulWithoutThe(prefix) +} + +// Check if the start of a given prefix is 'the', and if it is, whether removing it would still procure a +// meaningful match +func isPrefixMeaningfulWithoutThe(prefix string) bool { + if len(prefix) < 3 { + return false + } + + startsWithThe := strings.ToLower(prefix[:3]) == "the" + if !startsWithThe { + return true + } + + trimmedPrefix := strings.TrimFunc(prefix, isCharLikelySeparator) + if len(trimmedPrefix) <= 3 { + return false + } + + prefixWithoutThe := strings.TrimFunc(trimmedPrefix[3:], isCharLikelySeparator) + return isPrefixMeaningful(prefixWithoutThe) +} diff --git a/categorize/match/prefix_test.go b/categorize/match/prefix_test.go index 3a362ab..d151b73 100644 --- a/categorize/match/prefix_test.go +++ b/categorize/match/prefix_test.go @@ -39,3 +39,40 @@ func TestLongestCommonPrefix(t *testing.T) { assert.Equal(t, test.expectedPrefix, prefix) } } + +func TestIsPrefixMeaningful(t *testing.T) { + tt := []struct { + prefix string + expected bool + }{ + { + prefix: "Brooklyn", + expected: true, + }, + { + prefix: "b", + expected: false, + }, + { + prefix: "The", + expected: false, + }, + { + prefix: "The-", + expected: false, + }, + { + prefix: "The Runner", + expected: true, + }, + { + prefix: "The R", + expected: false, + }, + } + + for _, test := range tt { + isMeaningful := isPrefixMeaningful(test.prefix) + assert.Equal(t, test.expected, isMeaningful) + } +} diff --git a/categorize/tv_test.go b/categorize/tv_test.go index bf9c9ba..72de1f4 100644 --- a/categorize/tv_test.go +++ b/categorize/tv_test.go @@ -46,15 +46,27 @@ func TestFindBestFolder(t *testing.T) { "", match.ErrNoMatches, }, + { + "A match with a non-meaningful prefix ('the') should not match", + "The.Amazing.Spider-Man.1977.S01.VHSRip", + []string{"The-Good-Place", "The-Simpsons", "Stranger-Things", "Brooklyn-Nine-Nine", "The-IT-Crowd"}, + "", + match.ErrNoMatches, + }, + { + "A good match starting with 'the' should still match", + "The.Simpsons.S06.720p.DSNP.WEB-DL.DDP5.1.H.264-MZABI", + []string{"The-Good-Place", "The-Simpsons", "Stranger-Things", "Brooklyn-Nine-Nine", "The-IT-Crowd"}, + "The-Simpsons", + nil, + }, } for _, test := range table { t.Run(test.testName, func(t *testing.T) { - foundFolder, err := findShowFolder(test.fileName, test.folders) - if test.expectedError != nil { - assert.ErrorIs(t, err, test.expectedError, fmt.Sprintf("got %s", foundFolder)) + assert.ErrorIs(t, err, test.expectedError, fmt.Sprintf("got show %q and error %q", foundFolder, err)) } else { assert.Nil(t, err) assert.Equal(t, test.expectedFolder, foundFolder)