[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) - gtsocial-umbx - Unnamed repository; edit this file 'description' to name the repository.

commit 52109776f63ac59b2fef5cd7417becd9f0007acb
parent fece7fa70699d0844764131e65253eb409cfd3d2
Author: ugla <ugla@u8.is>
Date:   Tue, 15 Nov 2022 16:05:34 +0100

[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049)

* [bugfix] Fix unicode-unaware word boundary check in hashtag regex

Go's `\b` only matches ASCII word boundaries, so it misfires next to non-ASCII
letters, and because Go's regexp engine has no lookahead, the workarounds got
very ugly. So I replaced the regex with a parser.

The parser makes a single pass over the text (O(n) time), so performance should not be affected.

* [bugfix] Add back hashtag max length and add tests for it
Diffstat:
M internal/regexes/regexes.go  | 9 +--------
M internal/text/common.go  | 48 +++++++++++++++++++++++++++++-------------------
M internal/util/statustools.go  | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M internal/util/statustools_test.go  | 44 ++++++++++++++++++++++++++++++++++----------

4 files changed, 146 insertions(+), 45 deletions(-)
diff --git a/internal/regexes/regexes.go b/internal/regexes/regexes.go
@@ -47,7 +47,6 @@ const (
 const (
 	maximumUsernameLength       = 64
 	maximumEmojiShortcodeLength = 30
-	maximumHashtagLength        = 30
 )
 
 var (
@@ -66,17 +65,11 @@ var (
 	// such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols)
 	MentionName = regexp.MustCompile(mentionName)
 
-	// mention regex can be played around with here: https://regex101.com/r/G1oGR0/1
+	// mention regex can be played around with here: https://regex101.com/r/P0vpYG/1
 	mentionFinder = `(?:^|\s)(@\w+(?:@[a-zA-Z0-9_\-\.]+)?)`
 	// MentionFinder extracts mentions from a piece of text.
 	MentionFinder = regexp.MustCompile(mentionFinder)
 
-	// hashtag regex can be played with here: https://regex101.com/r/bpyGlj/1
-	hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[\p{L}\p{N}]{1,%d})(?:#|\b)`, maximumHashtagLength)
-	// HashtagFinder finds possible hashtags in a string.
-	// It returns just the string part of the hashtag, not the # symbol.
-	HashtagFinder = regexp.MustCompile(hashtagFinder)
-
 	emojiShortcode = fmt.Sprintf(`\w{2,%d}`, maximumEmojiShortcodeLength)
 	// EmojiShortcode validates an emoji name.
 	EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode))
diff --git a/internal/text/common.go b/internal/text/common.go
@@ -27,36 +27,46 @@ import (
 	"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
 	"github.com/superseriousbusiness/gotosocial/internal/log"
 	"github.com/superseriousbusiness/gotosocial/internal/regexes"
+	"github.com/superseriousbusiness/gotosocial/internal/util"
 )
 
 func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string {
-	return regexes.ReplaceAllStringFunc(regexes.HashtagFinder, in, func(match string, buf *bytes.Buffer) string {
-		// we have a match
-		matchTrimmed := strings.TrimSpace(match)
-		tagAsEntered := matchTrimmed[1:]
+	spans := util.FindHashtagSpansInText(in)
+
+	if len(spans) == 0 {
+		return in
+	}
+
+	var b strings.Builder
+	i := 0
+
+spans:
+	for _, t := range spans {
+		b.WriteString(in[i:t.First])
+		i = t.Second
+		tagAsEntered := in[t.First+1 : t.Second]
 
-		// check through the tags to find what we're matching
 		for _, tag := range tags {
 			if strings.EqualFold(tagAsEntered, tag.Name) {
-				// Add any dropped space from match
-				if unicode.IsSpace(rune(match[0])) {
-					buf.WriteByte(match[0])
-				}
-
 				// replace the #tag with the formatted tag content
 				// `<a href="tag.URL" class="mention hashtag" rel="tag">#<span>tagAsEntered</span></a>
-				buf.WriteString(`<a href="`)
-				buf.WriteString(tag.URL)
-				buf.WriteString(`" class="mention hashtag" rel="tag">#<span>`)
-				buf.WriteString(tagAsEntered)
-				buf.WriteString(`</span></a>`)
-				return buf.String()
+				b.WriteString(`<a href="`)
+				b.WriteString(tag.URL)
+				b.WriteString(`" class="mention hashtag" rel="tag">#<span>`)
+				b.WriteString(tagAsEntered)
+				b.WriteString(`</span></a>`)
+				continue spans
 			}
 		}
 
-		// the match wasn't in the list of tags for whatever reason, so just return the match as we found it so nothing changes
-		return match
-	})
+		b.WriteString(in[t.First:t.Second])
+	}
+
+	// Get the last bits.
+	i = spans[len(spans)-1].Second
+	b.WriteString(in[i:])
+
+	return b.String()
 }
 
 func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string {
diff --git a/internal/util/statustools.go b/internal/util/statustools.go
@@ -19,11 +19,16 @@
 package util
 
 import (
-	"strings"
+	"unicode"
+	"unicode/utf8"
 
 	"github.com/superseriousbusiness/gotosocial/internal/regexes"
 )
 
+const (
+	maximumHashtagLength = 30
+)
+
 // DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text,
 // and applies a regex to it to return a deduplicated list of account names
 // mentioned in that text, in the format "@user@example.org" or "@username" for
@@ -36,16 +41,71 @@ func DeriveMentionNamesFromText(text string) []string {
 	return UniqueStrings(mentionedAccounts)
 }
 
-// DeriveHashtagsFromText takes a plaintext (ie., not html-formatted) text,
-// and applies a regex to it to return a deduplicated list of hashtags
-// used in that text, without the leading #. The case of the returned
-// tags will be lowered, for consistency.
+type Pair[A, B any] struct {
+	First  A
+	Second B
+}
+
+// Byte index in original string
+// `First` includes `#`.
+type Span = Pair[int, int]
+
+// Takes a plaintext (ie., not HTML-formatted) text,
+// and returns a slice of unique hashtags.
 func DeriveHashtagsFromText(text string) []string {
+	tagsMap := make(map[string]bool)
 	tags := []string{}
-	for _, m := range regexes.HashtagFinder.FindAllStringSubmatch(text, -1) {
-		tags = append(tags, strings.TrimPrefix(m[1], "#"))
+
+	for _, v := range FindHashtagSpansInText(text) {
+		t := text[v.First+1 : v.Second]
+		if _, value := tagsMap[t]; !value {
+			tagsMap[t] = true
+			tags = append(tags, t)
+		}
+	}
+
+	return tags
+}
+
+// Takes a plaintext (ie., not HTML-formatted) text,
+// and returns a list of pairs of indices into the original string, where
+// hashtags are located.
+func FindHashtagSpansInText(text string) []Span {
+	tags := []Span{}
+	start := 0
+	// Keep one rune of lookbehind.
+	prev := ' '
+	inTag := false
+
+	for i, r := range text {
+		if r == '#' && isHashtagBoundary(prev) {
+			// Start of hashtag.
+			inTag = true
+			start = i
+		} else if inTag && !isPermittedInHashtag(r) && !isHashtagBoundary(r) {
+			// Inside the hashtag, but it was a phoney, gottem.
+			inTag = false
+		} else if inTag && isHashtagBoundary(r) {
+			// End of hashtag.
+			inTag = false
+			appendTag(&tags, text, start, i)
+		} else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) {
+			// End of text.
+			appendTag(&tags, text, start, irl)
+		}
+
+		prev = r
+	}
+
+	return tags
+}
+
+func appendTag(tags *[]Span, text string, start int, end int) {
+	l := end - start - 1
+	// This check could be moved out into the parsing loop if necessary!
+	if 0 < l && l <= maximumHashtagLength {
+		*tags = append(*tags, Span{First: start, Second: end})
 	}
-	return UniqueStrings(tags)
 }
 
 // DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text,
@@ -58,3 +118,17 @@ func DeriveEmojisFromText(text string) []string {
 	}
 	return UniqueStrings(emojis)
 }
+
+func isPermittedInHashtag(r rune) bool {
+	return unicode.IsLetter(r) || unicode.IsNumber(r)
+}
+
+// Decides where to break before or after a hashtag.
+func isHashtagBoundary(r rune) bool {
+	return r == '#' || // `###lol` should work
+		unicode.IsSpace(r) || // All kinds of Unicode whitespace.
+		unicode.IsControl(r) || // All kinds of control characters, like tab.
+		// Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`).
+		// But `someurl/#fragment` should not match, neither should HTML entities like `&#35;`.
+		('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r))
+}
diff --git a/internal/util/statustools_test.go b/internal/util/statustools_test.go
@@ -77,26 +77,50 @@ func (suite *StatusTestSuite) TestDeriveHashtagsOK() {
 
 # testing this one shouldn't work
 
-			#thisshouldwork
+			#thisshouldwork #dupe #dupe!! #dupe
 
 	here's a link with a fragment: https://example.org/whatever#ahhh
+	here's another link with a fragment: https://example.org/whatever/#ahhh
 
-#ThisShouldAlsoWork #not_this_though
+(#ThisShouldAlsoWork) #not_this_though
 
 #111111 thisalsoshouldn'twork#### ##
 
-#alimentación, #saúde
+#alimentación, #saúde, #lävistää, #ö, #네
+#ThisOneIsThirtyOneCharactersLon...  ...ng
+#ThisOneIsThirteyCharactersLong
 `
 
 	tags := util.DeriveHashtagsFromText(statusText)
-	assert.Len(suite.T(), tags, 7)
+	assert.Len(suite.T(), tags, 12)
 	assert.Equal(suite.T(), "testing123", tags[0])
 	assert.Equal(suite.T(), "also", tags[1])
 	assert.Equal(suite.T(), "thisshouldwork", tags[2])
-	assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[3])
-	assert.Equal(suite.T(), "111111", tags[4])
-	assert.Equal(suite.T(), "alimentación", tags[5])
-	assert.Equal(suite.T(), "saúde", tags[6])
+	assert.Equal(suite.T(), "dupe", tags[3])
+	assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[4])
+	assert.Equal(suite.T(), "111111", tags[5])
+	assert.Equal(suite.T(), "alimentación", tags[6])
+	assert.Equal(suite.T(), "saúde", tags[7])
+	assert.Equal(suite.T(), "lävistää", tags[8])
+	assert.Equal(suite.T(), "ö", tags[9])
+	assert.Equal(suite.T(), "네", tags[10])
+	assert.Equal(suite.T(), "ThisOneIsThirteyCharactersLong", tags[11])
+
+	statusText = `#올빼미 hej`
+	tags = util.DeriveHashtagsFromText(statusText)
+	assert.Equal(suite.T(), "올빼미", tags[0])
+}
+
+func (suite *StatusTestSuite) TestHashtagSpansOK() {
+	statusText := `#0 #3   #8aa`
+
+	spans := util.FindHashtagSpansInText(statusText)
+	assert.Equal(suite.T(), 0, spans[0].First)
+	assert.Equal(suite.T(), 2, spans[0].Second)
+	assert.Equal(suite.T(), 3, spans[1].First)
+	assert.Equal(suite.T(), 5, spans[1].Second)
+	assert.Equal(suite.T(), 8, spans[2].First)
+	assert.Equal(suite.T(), 12, spans[2].Second)
 }
 
 func (suite *StatusTestSuite) TestDeriveEmojiOK() {
@@ -127,7 +151,7 @@ Here's some normal text with an :emoji: at the end
 func (suite *StatusTestSuite) TestDeriveMultiple() {
 	statusText := `Another test @foss_satan@fossbros-anonymous.io
 
-	#Hashtag
+	#HashTag
 
 	Text`
 
@@ -139,7 +163,7 @@ func (suite *StatusTestSuite) TestDeriveMultiple() {
 	assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", ms[0])
 
 	assert.Len(suite.T(), hs, 1)
-	assert.Equal(suite.T(), "Hashtag", hs[0])
+	assert.Contains(suite.T(), hs, "HashTag")
 
 	assert.Len(suite.T(), es, 0)
 }

	gtsocial-umbx Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE

M	internal/regexes/regexes.go	\|	9	+--------
M	internal/text/common.go	\|	48	+++++++++++++++++++++++++++++-------------------
M	internal/util/statustools.go	\|	90	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M	internal/util/statustools_test.go	\|	44	++++++++++++++++++++++++++++++++++----------