mirror of
https://github.com/aljazceru/ditto.git
synced 2026-02-08 06:54:20 +01:00
feat: make lande great again
create detectLanguage() function that removes emojis, links and other weird invisible characters
This commit is contained in:
28
src/utils/language.test.ts
Normal file
28
src/utils/language.test.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
import { detectLanguage } from '@/utils/language.ts';
|
||||
import { assertEquals } from '@std/assert';
|
||||
|
||||
// Table-driven check that detectLanguage() still reports English once links,
// `nostr:` references and emoji noise surround (or are glued to) the sentence.
Deno.test('Detect English language', () => {
  const minConfidence = 0.90;

  // Each entry pairs an input text with the language code detectLanguage() must return.
  const cases: [text: string, expected: string | undefined][] = [
    // Empty input: nothing to detect.
    [``, undefined],
    // Plain English sentence.
    [`Good morning my fellow friends`, 'en'],
    // English sentence followed by a nostr reference.
    [
      `Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
    // Several URLs glued together in front of the sentence.
    [
      `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uWould you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
    // URLs plus a long run of emojis and text-presentation symbols.
    [
      `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u 😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎ https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
  ];

  for (const [text, expected] of cases) {
    assertEquals(detectLanguage(text, minConfidence), expected);
  }
});
|
||||
34
src/utils/language.ts
Normal file
34
src/utils/language.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import ISO6391, { type LanguageCode } from 'iso-639-1';
|
||||
import lande from 'lande';
|
||||
import linkify from 'linkifyjs';
|
||||
|
||||
// Register `nostr` as a custom linkify protocol at module load, so `nostr:` references
// are not tokenized as plain text (detectLanguage below keeps only 'text' tokens).
// NOTE(review): the second argument is presumably linkifyjs' "optional slash-slash" flag,
// allowing `nostr:...` without `//` — confirm against the linkifyjs docs.
linkify.registerCustomProtocol('nostr', true);
|
||||
|
||||
/**
 * Detects the language of `text` after stripping content that confuses detection.
 *
 * Sanitizing steps, in order:
 *  1. every `Extended_Pictographic` code point (emojis and similar symbols) is removed;
 *  2. each run of whitespace, BOM, no-break space, zero-width space/joiners and emoji
 *     variation selectors (U+FE0E and U+FE0F) is collapsed into a single space;
 *  3. the result is tokenized with linkify and only tokens of type `'text'` are kept,
 *     which drops links (including `nostr:` references registered above);
 *  4. the remaining text is trimmed.
 *
 * @param text Raw text to analyze.
 * @param minConfidence Threshold between 0 and 1 (such as 0.95) that the top result must reach.
 * @returns The language code of the top detection result when its confidence is greater
 *   than or equal to `minConfidence` and the code passes `ISO6391.validate`. Returns
 *   `undefined` otherwise, including when fewer than 10 characters remain after sanitizing.
 */
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
  // Emojis go first: removing them can leave variation selectors / joiners behind,
  // which the next replacement then cleans up.
  const withoutEmojis = text.replaceAll(/\p{Extended_Pictographic}/gu, '');

  // U+FE0F (emoji presentation selector) is handled alongside U+FE0E (text presentation
  // selector); otherwise an orphaned U+FE0F from e.g. "❤️" would survive sanitizing.
  const normalized = withoutEmojis.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\uFE0E\uFE0F]+/gu, ' ');

  // Keep only the plain-text tokens; everything linkify recognizes as something else is dropped.
  const sanitizedText = linkify.tokenize(normalized)
    .reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '')
    .trim();

  // Heuristic: very short texts are not worth classifying.
  if (sanitizedText.length < 10) return undefined;

  const [topResult] = lande(sanitizedText);
  if (!topResult) return undefined;

  // NOTE(review): the code is presumably ISO 639-3 (per the variable name); Intl.Locale is
  // used to obtain the `language` subtag that is then validated as ISO 639-1 — confirm.
  const [iso6393, confidence] = topResult;
  const locale = new Intl.Locale(iso6393);

  if (confidence >= minConfidence && ISO6391.validate(locale.language)) {
    return locale.language as LanguageCode;
  }

  return undefined;
}
|
||||
Reference in New Issue
Block a user