feat: move title extraction into commons package

Signed-off-by: Tilman Vatteroth <git@tilmanvatteroth.de>
This commit is contained in:
Tilman Vatteroth 2023-04-08 21:31:27 +02:00
parent 64e3513988
commit c6ce0bc704
No known key found for this signature in database
GPG key ID: 42498463316F048B
6 changed files with 124 additions and 36 deletions

View file

@ -38,6 +38,7 @@
"url": "https://github.com/hedgedoc/hedgedoc.git" "url": "https://github.com/hedgedoc/hedgedoc.git"
}, },
"dependencies": { "dependencies": {
"domhandler": "5.0.3",
"eventemitter2": "6.4.9", "eventemitter2": "6.4.9",
"isomorphic-ws": "5.0.0", "isomorphic-ws": "5.0.0",
"reveal.js": "4.5.0", "reveal.js": "4.5.0",

View file

@ -32,3 +32,5 @@ export * from './title-extraction/generate-note-title.js'
export * from './title-extraction/types/iso6391.js' export * from './title-extraction/types/iso6391.js'
export * from './title-extraction/types/frontmatter.js' export * from './title-extraction/types/frontmatter.js'
export * from './title-extraction/types/slide-show-options.js' export * from './title-extraction/types/slide-show-options.js'
export { extractFirstHeading } from './title-extraction/extract-first-heading.js'

View file

@ -0,0 +1,60 @@
/*
* SPDX-FileCopyrightText: 2023 The HedgeDoc developers (see AUTHORS file)
*
* SPDX-License-Identifier: AGPL-3.0-only
*/
import { extractFirstHeading } from './extract-first-heading.js'
import { describe, expect, it } from '@jest/globals'
import { Document, Element, Text } from 'domhandler'
// Test suite for extractFirstHeading. The per-level cases run once for every headline level h1 to h6.
describe('extract first heading', () => {
  it('returns undefined if no headline is present', () => {
    const paragraph = new Element('p', {}, [new Text('no headline here')])
    const document = new Document([paragraph])
    expect(extractFirstHeading(document)).toBeUndefined()
  })

  describe.each([1, 2, 3, 4, 5, 6])('h%d', (headlineIndex) => {
    it('extracts plain text', () => {
      const content = `headline${headlineIndex}`
      const headline = new Element(`h${headlineIndex}`, {}, [new Text(content)])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe(content)
    })

    it("doesn't extract heading-anchor", () => {
      const headline = new Element(`h${headlineIndex}`, {}, [
        new Element('a', { class: 'class1 heading-anchor class2' }, [
          new Text('invalid link content')
        ])
      ])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe('')
    })

    it('extracts nested texts', () => {
      const headline = new Element(`h${headlineIndex}`, {}, [
        new Element('a', {}, [
          new Text('Valid'),
          new Element('div', {}, [new Text('Text')]),
          new Text(`${headlineIndex}`)
        ])
      ])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe(`ValidText${headlineIndex}`)
    })

    it('extracts image alt texts', () => {
      const headline = new Element(`h${headlineIndex}`, {}, [
        new Element('img', { alt: 'Image Alt' })
      ])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe('Image Alt')
    })

    it('extracts only the first found headline', () => {
      const headline1 = new Element(`h${headlineIndex}`, {}, [
        new Text(`headline${headlineIndex}`)
      ])
      // The second headline must carry a text that differs from the first one for
      // every level, otherwise the assertion can't tell which headline was extracted.
      const headline2 = new Element(`h${headlineIndex}`, {}, [
        new Text('second headline')
      ])
      const document = new Document([headline1, headline2])
      expect(extractFirstHeading(document)).toBe(`headline${headlineIndex}`)
    })
  })
})

View file

@ -0,0 +1,57 @@
/*
* SPDX-FileCopyrightText: 2023 The HedgeDoc developers (see AUTHORS file)
*
* SPDX-License-Identifier: AGPL-3.0-only
*/
import { Element, isTag, isText, Node, NodeWithChildren } from 'domhandler'
// Matches exactly the tag names h1 to h6, ignoring case.
// Deliberately not global: a `g` flag would make this shared regex stateful
// (`lastIndex`), so repeated `test`/`exec` calls could yield alternating results.
const headlineTagRegex = /^h[1-6]$/i
/**
 * Looks for the first headline tag among the direct children of the given node
 * and returns its text content.
 *
 * @param nodes The parent node whose direct children are searched for a headline
 * @returns The trimmed plain text of the first headline, or `undefined` if none of the children is a headline.
 */
export function extractFirstHeading(
  nodes: NodeWithChildren
): string | undefined {
  const headline = checkNodesForHeadline(nodes.children)
  return headline === undefined
    ? undefined
    : extractInnerTextFromNode(headline).trim()
}
/**
 * Finds the first node in the given list that is a tag whose name matches the headline tag regex.
 *
 * @param nodes The nodes to inspect in order
 * @returns The first matching node, or `undefined` if no node matches.
 */
function checkNodesForHeadline(nodes: Node[]): Node | undefined {
  for (const candidate of nodes) {
    if (isTag(candidate) && candidate.name.match(headlineTagRegex)) {
      return candidate
    }
  }
  return undefined
}
/**
 * Converts a node into plain text.
 *
 * @param node The node to convert
 * @returns The node value for text nodes, the extracted inner text for tags and an empty string for every other node.
 */
function extractInnerTextFromNode(node: Node): string {
  if (isText(node)) {
    return node.nodeValue
  }
  return isTag(node) ? extractInnerTextFromTag(node) : ''
}
/**
 * Converts a tag into plain text.
 *
 * - Links that carry the class `heading-anchor` contribute no text.
 * - Images contribute the value of their `alt` attribute (or nothing if it is missing).
 * - Every other tag contributes the concatenated text of its children.
 *
 * @param node The tag to convert
 * @returns The plain text representation of the tag
 */
function extractInnerTextFromTag(node: Element): string {
  if (node.name === 'a') {
    // Class tokens may be separated by any ASCII whitespace (space, tab, LF, FF, CR),
    // so splitting on a single space character only would miss some anchors.
    const classTokens =
      findAttribute(node, 'class')?.value.split(/[ \t\n\f\r]+/) ?? []
    if (classTokens.includes('heading-anchor')) {
      return ''
    }
  }
  if (node.name === 'img') {
    return findAttribute(node, 'alt')?.value ?? ''
  }
  return node.children.reduce(
    (text, child) => text + extractInnerTextFromNode(child),
    ''
  )
}
/**
 * Looks up an attribute of a tag by its exact name.
 *
 * @param node The tag whose attributes are searched
 * @param attributeName The name of the wanted attribute
 * @returns The first attribute with the given name, or `undefined` if the tag has no such attribute.
 */
function findAttribute(node: Element, attributeName: string) {
  for (const attribute of node.attributes) {
    if (attribute.name === attributeName) {
      return attribute
    }
  }
  return undefined
}

View file

@ -4,13 +4,11 @@
* SPDX-License-Identifier: AGPL-3.0-only * SPDX-License-Identifier: AGPL-3.0-only
*/ */
import { NodeProcessor } from '../../node-preprocessors/node-processor' import { NodeProcessor } from '../../node-preprocessors/node-processor'
import { extractFirstHeading } from '@hedgedoc/commons'
import { Optional } from '@mrdrogdrog/optional' import { Optional } from '@mrdrogdrog/optional'
import type { Document, Node, Element } from 'domhandler' import type { Document } from 'domhandler'
import { isTag, isText } from 'domhandler'
import type { EventEmitter2 } from 'eventemitter2' import type { EventEmitter2 } from 'eventemitter2'
const headlineTagRegex = /^h[1-6]$/gi
/** /**
* Searches for the first headline tag and extracts its plain text content. * Searches for the first headline tag and extracts its plain text content.
*/ */
@ -22,40 +20,9 @@ export class ExtractFirstHeadlineNodeProcessor extends NodeProcessor {
} }
process(nodes: Document): Document { process(nodes: Document): Document {
Optional.ofNullable(this.checkNodesForHeadline(nodes.children)) Optional.ofNullable(extractFirstHeading(nodes))
.map((foundHeadlineNode) => this.extractInnerTextFromNode(foundHeadlineNode).trim())
.filter((text) => text !== '') .filter((text) => text !== '')
.ifPresent((text) => this.eventEmitter.emit(ExtractFirstHeadlineNodeProcessor.EVENT_NAME, text)) .ifPresent((text) => this.eventEmitter.emit(ExtractFirstHeadlineNodeProcessor.EVENT_NAME, text))
return nodes return nodes
} }
private checkNodesForHeadline(nodes: Node[]): Node | undefined {
return nodes.find((node) => isTag(node) && node.name.match(headlineTagRegex))
}
private extractInnerTextFromNode(node: Node): string {
if (isText(node)) {
return node.nodeValue
} else if (isTag(node)) {
return this.extractInnerTextFromTag(node)
} else {
return ''
}
}
private extractInnerTextFromTag(node: Element): string {
if (node.name === 'a' && this.findAttribute(node, 'class')?.value.includes('heading-anchor')) {
return ''
} else if (node.name === 'img') {
return this.findAttribute(node, 'alt')?.value ?? ''
} else {
return node.children.reduce((state, child) => {
return state + this.extractInnerTextFromNode(child)
}, '')
}
}
private findAttribute(node: Element, attributeName: string) {
return node.attributes.find((attribute) => attribute.name === attributeName)
}
} }

View file

@ -2323,6 +2323,7 @@ __metadata:
"@types/ws": 8.5.4 "@types/ws": 8.5.4
"@typescript-eslint/eslint-plugin": 5.58.0 "@typescript-eslint/eslint-plugin": 5.58.0
"@typescript-eslint/parser": 5.58.0 "@typescript-eslint/parser": 5.58.0
domhandler: 5.0.3
eslint: 8.38.0 eslint: 8.38.0
eslint-config-prettier: 8.8.0 eslint-config-prettier: 8.8.0
eslint-plugin-jest: 27.2.1 eslint-plugin-jest: 27.2.1