overleaf-cep/services/web/frontend/js/features/word-count-modal/utils/segmenters.ts
Alf Eaton fb0cfbe0bb Add more detail to word count UI (#25400)
GitOrigin-RevId: 3521f2ea03332e46ef1bac634ce0650cdce01249
2025-05-14 08:06:40 +00:00

64 lines
1.4 KiB
TypeScript

// Fallback "word" pattern: a run of letters, combining marks, apostrophes,
// hyphens and full stops. Combining marks (\p{M}) must be part of the class,
// otherwise words written with decomposed accents (e + U+0301) or with
// dependent vowel signs / viramas (e.g. Devanagari) are cut at every mark and
// counted as several words.
const wordRe = /['\-.\p{L}\p{M}]+/gu
const wordLikeRe = /\p{L}/u // must contain at least one "letter" to be a word
// Fallback "character" pattern: any single non-whitespace code point.
const characterRe = /\S/gu
/**
 * One piece of segmented text. `isWordLike` is optional because not every
 * segmenter in this file sets it.
 */
interface SegmentDataLike {
  segment: string
  isWordLike?: boolean
}

/**
 * The minimal segmenter surface this module relies on: a `segment` method
 * whose result can be iterated to obtain the individual segments.
 */
interface SegmenterLike {
  segment(input: string): {
    [Symbol.iterator](): IterableIterator<SegmentDataLike>
  }
}

/** A pair of segmenters: one splitting text into words, one into characters. */
export type Segmenters = Record<'word' | 'character', SegmenterLike>
/**
 * Builds the word and character segmenters for the given locale.
 *
 * Uses `Intl.Segmenter` when the runtime provides it. If it is missing, or if
 * constructing either segmenter throws, the regex-based `genericSegmenters`
 * are returned instead.
 *
 * @param locale - Optional locale passed straight to `Intl.Segmenter`.
 * @returns A `word` segmenter and a `character` (grapheme) segmenter.
 */
export const createSegmenters = (locale?: string): Segmenters => {
  if (Intl.Segmenter) {
    try {
      // TODO: count hyphenated words as a single word
      const word = new Intl.Segmenter(locale, { granularity: 'word' })
      const character = new Intl.Segmenter(locale, { granularity: 'grapheme' })
      return { word, character }
    } catch {
      // construction failed: use the generic segmenters below
    }
  }
  return genericSegmenters
}
/**
 * Regex-based stand-ins for `Intl.Segmenter`, used by `createSegmenters` when
 * the native implementation is not usable. Both return plain arrays of
 * segments.
 */
const genericSegmenters = {
  word: {
    /**
     * Returns every match of `wordRe` in `input`; each one is flagged
     * `isWordLike` according to `wordLikeRe`.
     */
    segment(input: string) {
      return Array.from(
        input.matchAll(wordRe),
        ([text]): SegmentDataLike => ({
          segment: text,
          isWordLike: wordLikeRe.test(text),
        })
      )
    },
  },
  character: {
    /** Returns every match of `characterRe` in `input` (no `isWordLike`). */
    segment(input: string) {
      return Array.from(
        input.matchAll(characterRe),
        ([text]): SegmentDataLike => ({ segment: text })
      )
    },
  },
}