From 58acaa9ae666d4e40c5b68316513a0fcb9200daf Mon Sep 17 00:00:00 2001 From: Renaud Chaput Date: Wed, 23 Aug 2023 08:18:07 +0200 Subject: [PATCH] Better hashtag normalization when processing a post (#26614) --- .../components/__tests__/hashtag_bar.tsx | 15 ++++++++ .../mastodon/components/hashtag_bar.tsx | 34 +++++++++++++------ 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/app/javascript/mastodon/components/__tests__/hashtag_bar.tsx b/app/javascript/mastodon/components/__tests__/hashtag_bar.tsx index c7db485d0..1856b7109 100644 --- a/app/javascript/mastodon/components/__tests__/hashtag_bar.tsx +++ b/app/javascript/mastodon/components/__tests__/hashtag_bar.tsx @@ -105,6 +105,21 @@ describe('computeHashtagBarForStatus', () => { ); }); + it('handles server-side normalized tags with accentuated characters', () => { + const status = createStatus( + '

Text

#éaa #Éaa

', + ['eaa'], // The server may normalize the hashtags in the `tags` attribute + ); + + const { hashtagsInBar, statusContentProps } = + computeHashtagBarForStatus(status); + + expect(hashtagsInBar).toEqual(['Éaa']); + expect(statusContentProps.statusContent).toMatchInlineSnapshot( + `"

Text

"`, + ); + }); + it('does not display in bar a hashtag in content with a case difference', () => { const status = createStatus( '

Text #Éaa

#éaa

', diff --git a/app/javascript/mastodon/components/hashtag_bar.tsx b/app/javascript/mastodon/components/hashtag_bar.tsx index 8781c2663..75bd74da0 100644 --- a/app/javascript/mastodon/components/hashtag_bar.tsx +++ b/app/javascript/mastodon/components/hashtag_bar.tsx @@ -23,8 +23,9 @@ export type StatusLike = Record<{ }>; function normalizeHashtag(hashtag: string) { - if (hashtag && hashtag.startsWith('#')) return hashtag.slice(1); - else return hashtag; + return ( + hashtag && hashtag.startsWith('#') ? hashtag.slice(1) : hashtag + ).normalize('NFKC'); } function isNodeLinkHashtag(element: Node): element is HTMLLinkElement { @@ -70,9 +71,16 @@ function uniqueHashtagsWithCaseHandling(hashtags: string[]) { } // Create the collator once, this is much more efficient -const collator = new Intl.Collator(undefined, { sensitivity: 'accent' }); +const collator = new Intl.Collator(undefined, { + sensitivity: 'base', // we use this to emulate the ASCII folding done on the server-side, hopefuly more efficiently +}); + function localeAwareInclude(collection: string[], value: string) { - return collection.find((item) => collator.compare(item, value) === 0); + const normalizedValue = value.normalize('NFKC'); + + return !!collection.find( + (item) => collator.compare(item.normalize('NFKC'), normalizedValue) === 0, + ); } // We use an intermediate function here to make it easier to test @@ -121,11 +129,13 @@ export function computeHashtagBarForStatus(status: StatusLike): { // try to see if the last line is only hashtags let onlyHashtags = true; + const normalizedTagNames = tagNames.map((tag) => tag.normalize('NFKC')); + Array.from(lastChild.childNodes).forEach((node) => { if (isNodeLinkHashtag(node) && node.textContent) { const normalized = normalizeHashtag(node.textContent); - if (!localeAwareInclude(tagNames, normalized)) { + if (!localeAwareInclude(normalizedTagNames, normalized)) { // stop here, this is not a real hashtag, so consider it as text onlyHashtags = false; return; @@ -140,12 +150,14 @@ export function computeHashtagBarForStatus(status: StatusLike): { } }); - const hashtagsInBar = tagNames.filter( - (tag) => - // the tag does not appear at all in the status content, it is an out-of-band tag - !localeAwareInclude(contentHashtags, tag) && - !localeAwareInclude(lastLineHashtags, tag), - ); + const hashtagsInBar = tagNames.filter((tag) => { + const normalizedTag = tag.normalize('NFKC'); + // the tag does not appear at all in the status content, it is an out-of-band tag + return ( + !localeAwareInclude(contentHashtags, normalizedTag) && + !localeAwareInclude(lastLineHashtags, normalizedTag) + ); + }); const isOnlyOneLine = contentWithoutLastLine.content.childElementCount === 0; const hasMedia = status.get('media_attachments').size > 0;