feat: replace custom OpenGraph extraction with fetch-opengraph library

- Install fetch-opengraph library for robust OpenGraph extraction
- Replace custom regex patterns and proxy logic with specialized library
- Simplify AddBookmarkModal OpenGraph extraction logic
- Remove fetchRawHtml function from readerService
- Improve reliability and maintainability of metadata extraction
This commit is contained in:
Gigi
2025-10-25 01:14:28 +02:00
parent 92145af2bb
commit 6ac40c8a17
4 changed files with 103 additions and 43 deletions

60
package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "boris",
"version": "0.10.19",
"version": "0.10.23",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "boris",
"version": "0.10.19",
"version": "0.10.23",
"dependencies": {
"@fortawesome/fontawesome-svg-core": "^7.1.0",
"@fortawesome/free-regular-svg-icons": "^7.1.0",
@@ -23,6 +23,7 @@
"applesauce-relay": "^4.0.0",
"date-fns": "^4.1.0",
"fast-average-color": "^9.5.0",
"fetch-opengraph": "^1.0.36",
"nostr-tools": "^2.4.0",
"prismjs": "^1.30.0",
"react": "^18.2.0",
@@ -4502,6 +4503,15 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/axios": {
"version": "0.21.4",
"resolved": "https://registry.npmjs.org/axios/-/axios-0.21.4.tgz",
"integrity": "sha512-ut5vewkiu8jjGBdqpM44XxjuCjq9LAKeHVmoVfHVzy8eHgxxq8SbAVQNovDA8mVi05kP0Ea/n/UzcSHcTJQfNg==",
"license": "MIT",
"dependencies": {
"follow-redirects": "^1.14.0"
}
},
"node_modules/babel-plugin-polyfill-corejs2": {
"version": "0.4.14",
"resolved": "https://registry.npmjs.org/babel-plugin-polyfill-corejs2/-/babel-plugin-polyfill-corejs2-0.4.14.tgz",
@@ -6171,6 +6181,16 @@
"reusify": "^1.0.4"
}
},
"node_modules/fetch-opengraph": {
"version": "1.0.36",
"resolved": "https://registry.npmjs.org/fetch-opengraph/-/fetch-opengraph-1.0.36.tgz",
"integrity": "sha512-w2Gs64zjL1O86E0I6E26MrxeXpTrR8Y1vWrgupmZN6NXKV8F5I3W0tlh+ZX686jZwxyilWnQjYwgnWpdETdHWw==",
"license": "MIT",
"dependencies": {
"axios": "^0.21.1",
"html-entities": "^2.3.2"
}
},
"node_modules/file-entry-cache": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.1.tgz",
@@ -6264,6 +6284,26 @@
"dev": true,
"license": "ISC"
},
"node_modules/follow-redirects": {
"version": "1.15.11",
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz",
"integrity": "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==",
"funding": [
{
"type": "individual",
"url": "https://github.com/sponsors/RubenVerborgh"
}
],
"license": "MIT",
"engines": {
"node": ">=4.0"
},
"peerDependenciesMeta": {
"debug": {
"optional": true
}
}
},
"node_modules/for-each": {
"version": "0.3.5",
"resolved": "https://registry.npmjs.org/for-each/-/for-each-0.3.5.tgz",
@@ -6896,6 +6936,22 @@
"he": "bin/he"
}
},
"node_modules/html-entities": {
"version": "2.6.0",
"resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.6.0.tgz",
"integrity": "sha512-kig+rMn/QOVRvr7c86gQ8lWXq+Hkv6CbAH1hLu+RG338StTpE8Z0b44SDVaqVu7HGKf27frdmUYEs9hTUX/cLQ==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/mdevils"
},
{
"type": "patreon",
"url": "https://patreon.com/mdevils"
}
],
"license": "MIT"
},
"node_modules/html-url-attributes": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/html-url-attributes/-/html-url-attributes-3.0.1.tgz",

View File

@@ -26,6 +26,7 @@
"applesauce-relay": "^4.0.0",
"date-fns": "^4.1.0",
"fast-average-color": "^9.5.0",
"fetch-opengraph": "^1.0.36",
"nostr-tools": "^2.4.0",
"prismjs": "^1.30.0",
"react": "^18.2.0",

View File

@@ -4,41 +4,37 @@ import { FontAwesomeIcon } from '@fortawesome/react-fontawesome'
import { faTimes, faSpinner } from '@fortawesome/free-solid-svg-icons'
import IconButton from './IconButton'
import { fetchReadableContent } from '../services/readerService'
import { fetch } from 'fetch-opengraph'
interface AddBookmarkModalProps {
onClose: () => void
onSave: (url: string, title?: string, description?: string, tags?: string[]) => Promise<void>
}
/**
 * Returns the first capture group of the first pattern that yields one.
 *
 * Each entry in `patterns` is a regular-expression source string; it is
 * compiled case-insensitively and run against `html` in the order given.
 *
 * @param html - Text to search.
 * @param patterns - Regex source strings, tried in order; each is expected
 *   to contain one capture group holding the value of interest.
 * @returns The captured text, or `null` when no pattern produces a capture.
 */
function extractMetaTag(html: string, patterns: string[]): string | null {
  for (const pattern of patterns) {
    const captured = new RegExp(pattern, 'i').exec(html)?.[1]
    // A pattern may match without capturing (no group, or a group that did
    // not participate). Skip it rather than leaking `undefined` to callers,
    // so later patterns still get a chance and the return type stays honest.
    if (captured !== undefined) return captured
  }
  return null
}
function extractTags(html: string): string[] {
// Helper to extract tags from OpenGraph data
function extractTagsFromOgData(ogData: any): string[] {
const tags: string[] = []
// Extract keywords meta tag
const keywords = extractMetaTag(html, [
'<meta\\s+name=["\'"]keywords["\'"]\\s+content=["\'"]([^"\']+)["\']'
])
if (keywords) {
keywords.split(/[,;]/)
.map(k => k.trim().toLowerCase())
.filter(k => k.length > 0 && k.length < 30)
.forEach(k => tags.push(k))
// Extract keywords from OpenGraph data
if (ogData.keywords) {
ogData.keywords.split(/[,;]/)
.map((k: string) => k.trim().toLowerCase())
.filter((k: string) => k.length > 0 && k.length < 30)
.forEach((k: string) => tags.push(k))
}
// Extract article:tag (multiple possible)
const articleTagRegex = /<meta\s+property=["']article:tag["']\s+content=["']([^"']+)["']/gi
let match
while ((match = articleTagRegex.exec(html)) !== null) {
const tag = match[1].trim().toLowerCase()
if (tag && tag.length < 30) tags.push(tag)
// Extract article:tag from OpenGraph data
if (ogData['article:tag']) {
const articleTags = Array.isArray(ogData['article:tag'])
? ogData['article:tag']
: [ogData['article:tag']]
articleTags.forEach((tag: string) => {
const cleanTag = tag.trim().toLowerCase()
if (cleanTag && cleanTag.length < 30) {
tags.push(cleanTag)
}
})
}
return Array.from(new Set(tags)).slice(0, 5)
@@ -83,17 +79,27 @@ const AddBookmarkModal: React.FC<AddBookmarkModalProps> = ({ onClose, onSave })
fetchTimeoutRef.current = window.setTimeout(async () => {
setIsFetchingMetadata(true)
try {
const content = await fetchReadableContent(normalizedUrl)
lastFetchedUrlRef.current = normalizedUrl
// Fetch both readable content and OpenGraph data in parallel
const [content, ogData] = await Promise.all([
fetchReadableContent(normalizedUrl),
fetch(normalizedUrl).catch(() => null) // Don't fail if OpenGraph fetch fails
])
lastFetchedUrlRef.current = normalizedUrl
let extractedAnything = false
// Extract title: prioritize og:title > twitter:title > <title>
if (!title && content.html) {
const extractedTitle = extractMetaTag(content.html, [
'<meta\\s+property=["\'"]og:title["\'"]\\s+content=["\'"]([^"\']+)["\']',
'<meta\\s+name=["\'"]twitter:title["\'"]\\s+content=["\'"]([^"\']+)["\']'
]) || content.title
// Extract title: prioritize og:title > twitter:title > content.title
if (!title) {
let extractedTitle = null
if (ogData) {
extractedTitle = ogData['og:title'] || ogData['twitter:title'] || ogData.title
}
// Fallback to content.title if no OpenGraph title found
if (!extractedTitle) {
extractedTitle = content.title
}
if (extractedTitle) {
setTitle(extractedTitle)
@@ -102,12 +108,8 @@ const AddBookmarkModal: React.FC<AddBookmarkModalProps> = ({ onClose, onSave })
}
// Extract description: prioritize og:description > twitter:description > meta description
if (!description && content.html) {
const extractedDesc = extractMetaTag(content.html, [
'<meta\\s+property=["\'"]og:description["\'"]\\s+content=["\'"]([^"\']+)["\']',
'<meta\\s+name=["\'"]twitter:description["\'"]\\s+content=["\'"]([^"\']+)["\']',
'<meta\\s+name=["\'"]description["\'"]\\s+content=["\'"]([^"\']+)["\']'
])
if (!description && ogData) {
const extractedDesc = ogData['og:description'] || ogData['twitter:description'] || ogData.description
if (extractedDesc) {
setDescription(extractedDesc)
@@ -116,8 +118,8 @@ const AddBookmarkModal: React.FC<AddBookmarkModalProps> = ({ onClose, onSave })
}
// Extract tags from keywords and article:tag (only if user hasn't modified tags)
if (!tagsInput && content.html) {
const extractedTags = extractTags(content.html)
if (!tagsInput && ogData) {
const extractedTags = extractTagsFromOgData(ogData)
// Only add boris tag if we extracted something
if (extractedAnything || extractedTags.length > 0) {

View File

@@ -110,3 +110,4 @@ export async function fetchReadableContent(
}