mirror of
https://github.com/hedgedoc/hedgedoc.git
synced 2024-11-25 03:06:31 -05:00
feat: move title extraction into commons package
Signed-off-by: Tilman Vatteroth <git@tilmanvatteroth.de>
This commit is contained in:
parent
64e3513988
commit
c6ce0bc704
6 changed files with 124 additions and 36 deletions
|
@ -38,6 +38,7 @@
|
||||||
"url": "https://github.com/hedgedoc/hedgedoc.git"
|
"url": "https://github.com/hedgedoc/hedgedoc.git"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"domhandler": "5.0.3",
|
||||||
"eventemitter2": "6.4.9",
|
"eventemitter2": "6.4.9",
|
||||||
"isomorphic-ws": "5.0.0",
|
"isomorphic-ws": "5.0.0",
|
||||||
"reveal.js": "4.5.0",
|
"reveal.js": "4.5.0",
|
||||||
|
|
|
@ -32,3 +32,5 @@ export * from './title-extraction/generate-note-title.js'
|
||||||
export * from './title-extraction/types/iso6391.js'
|
export * from './title-extraction/types/iso6391.js'
|
||||||
export * from './title-extraction/types/frontmatter.js'
|
export * from './title-extraction/types/frontmatter.js'
|
||||||
export * from './title-extraction/types/slide-show-options.js'
|
export * from './title-extraction/types/slide-show-options.js'
|
||||||
|
|
||||||
|
export { extractFirstHeading } from './title-extraction/extract-first-heading.js'
|
||||||
|
|
60
commons/src/title-extraction/extract-first-heading.spec.ts
Normal file
60
commons/src/title-extraction/extract-first-heading.spec.ts
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
/*
|
||||||
|
* SPDX-FileCopyrightText: 2023 The HedgeDoc developers (see AUTHORS file)
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: AGPL-3.0-only
|
||||||
|
*/
|
||||||
|
import { extractFirstHeading } from './extract-first-heading.js'
|
||||||
|
import { describe, expect, it } from '@jest/globals'
|
||||||
|
import { Document, Element, Text } from 'domhandler'
|
||||||
|
|
||||||
|
// Specification of extractFirstHeading: the per-level cases run once for every
// headline level (h1-h6), the remaining cases cover documents without a usable headline.
describe('extract first heading', () => {
  describe.each([1, 2, 3, 4, 5, 6])('h%d', (headlineIndex) => {
    it('extracts plain text', () => {
      const content = `headline${headlineIndex}`
      const headline = new Element(`h${headlineIndex}`, {}, [new Text(content)])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe(content)
    })

    it("doesn't extract heading-anchor", () => {
      const headline = new Element(`h${headlineIndex}`, {}, [
        new Element('a', { class: 'class1 heading-anchor class2' }, [
          new Text('invalid link content')
        ])
      ])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe('')
    })

    it('extracts nested texts', () => {
      const headline = new Element(`h${headlineIndex}`, {}, [
        new Element('a', {}, [
          new Text('Valid'),
          new Element('div', {}, [new Text('Text')]),
          new Text(`${headlineIndex}`)
        ])
      ])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe(`ValidText${headlineIndex}`)
    })

    it('extracts image alt texts', () => {
      const headline = new Element(`h${headlineIndex}`, {}, [
        new Element('img', { alt: 'Image Alt' })
      ])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe('Image Alt')
    })

    it('extracts only the first found headline', () => {
      const headline1 = new Element(`h${headlineIndex}`, {}, [
        new Text(`headline${headlineIndex}`)
      ])
      // The second headline needs a text that differs from the first one for EVERY
      // level. A fixed "headline1" equals the first headline's text in the h1 run,
      // so picking the wrong headline would go unnoticed there.
      const headline2 = new Element(`h${headlineIndex}`, {}, [
        new Text('second headline')
      ])
      const document = new Document([headline1, headline2])
      expect(extractFirstHeading(document)).toBe(`headline${headlineIndex}`)
    })
  })

  it('returns undefined if there is no headline', () => {
    const paragraph = new Element('p', {}, [new Text('just a paragraph')])
    expect(extractFirstHeading(new Document([paragraph]))).toBeUndefined()
    expect(extractFirstHeading(new Document([]))).toBeUndefined()
  })

  it("doesn't extract headlines that aren't top level", () => {
    // Only the direct children of the given node are inspected.
    const wrapper = new Element('div', {}, [
      new Element('h1', {}, [new Text('nested headline')])
    ])
    expect(extractFirstHeading(new Document([wrapper]))).toBeUndefined()
  })
})
|
57
commons/src/title-extraction/extract-first-heading.ts
Normal file
57
commons/src/title-extraction/extract-first-heading.ts
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
/*
|
||||||
|
* SPDX-FileCopyrightText: 2023 The HedgeDoc developers (see AUTHORS file)
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: AGPL-3.0-only
|
||||||
|
*/
|
||||||
|
import { Element, isTag, isText, Node, NodeWithChildren } from 'domhandler'
|
||||||
|
|
||||||
|
// Matches exactly the tag names h1 to h6, ignoring case.
// Deliberately NOT global: a global regex stores its position in `lastIndex`
// between `test()`/`exec()` calls, which makes a shared module-level instance
// give alternating results. For a fully anchored pattern the flag adds nothing.
const headlineTagRegex = /^h[1-6]$/i
|
||||||
|
|
||||||
|
/**
 * Looks for the first headline element (h1 to h6) among the direct children
 * of the given node and returns its plain text content.
 *
 * @param nodes The parent node whose direct children are searched for a headline
 * @returns The trimmed text of the first headline (possibly an empty string),
 *          or `undefined` if none of the children is a headline.
 */
export function extractFirstHeading(
  nodes: NodeWithChildren
): string | undefined {
  const headline = checkNodesForHeadline(nodes.children)
  return headline === undefined
    ? undefined
    : extractInnerTextFromNode(headline).trim()
}
|
||||||
|
|
||||||
|
/**
 * Returns the first node in the list that is a tag named h1 to h6.
 *
 * @param nodes The nodes to inspect, in order. Their descendants are not visited.
 * @returns The first headline node, or `undefined` if the list contains none.
 */
function checkNodesForHeadline(nodes: Node[]): Node | undefined {
  for (const candidate of nodes) {
    if (isTag(candidate) && candidate.name.match(headlineTagRegex) !== null) {
      return candidate
    }
  }
  return undefined
}
|
||||||
|
|
||||||
|
/**
 * Converts a single node into its plain text representation.
 *
 * @param node The node to convert
 * @returns The value of a text node, the extracted inner text of a tag,
 *          or an empty string for every other kind of node.
 */
function extractInnerTextFromNode(node: Node): string {
  if (isTag(node)) {
    return extractInnerTextFromTag(node)
  }
  return isText(node) ? node.nodeValue : ''
}
|
||||||
|
|
||||||
|
/**
 * Builds the plain text representation of an element.
 *
 * - An `a` element that carries the class `heading-anchor` contributes no text.
 * - An `img` element contributes the value of its `alt` attribute.
 * - Every other element contributes the concatenated text of its children.
 *
 * @param node The element whose text should be extracted
 * @returns The extracted text, or an empty string if there is none
 */
function extractInnerTextFromTag(node: Element): string {
  if (node.name === 'a') {
    // Class names are separated by any ASCII whitespace (space, tab, LF, FF, CR),
    // not only by single spaces, so split on all of them before comparing tokens.
    const classNames =
      findAttribute(node, 'class')?.value.split(/[ \t\n\f\r]+/) ?? []
    if (classNames.includes('heading-anchor')) {
      return ''
    }
  }
  if (node.name === 'img') {
    return findAttribute(node, 'alt')?.value ?? ''
  }
  return node.children.reduce(
    (text, child) => text + extractInnerTextFromNode(child),
    ''
  )
}
|
||||||
|
|
||||||
|
/**
 * Looks up an attribute of an element by its exact name.
 *
 * @param node The element whose attributes are searched
 * @param attributeName The name the attribute must have
 * @returns The first attribute with that name, or `undefined` if there is none
 */
function findAttribute(node: Element, attributeName: string) {
  for (const attribute of node.attributes) {
    if (attribute.name === attributeName) {
      return attribute
    }
  }
  return undefined
}
|
|
@ -4,13 +4,11 @@
|
||||||
* SPDX-License-Identifier: AGPL-3.0-only
|
* SPDX-License-Identifier: AGPL-3.0-only
|
||||||
*/
|
*/
|
||||||
import { NodeProcessor } from '../../node-preprocessors/node-processor'
|
import { NodeProcessor } from '../../node-preprocessors/node-processor'
|
||||||
|
import { extractFirstHeading } from '@hedgedoc/commons'
|
||||||
import { Optional } from '@mrdrogdrog/optional'
|
import { Optional } from '@mrdrogdrog/optional'
|
||||||
import type { Document, Node, Element } from 'domhandler'
|
import type { Document } from 'domhandler'
|
||||||
import { isTag, isText } from 'domhandler'
|
|
||||||
import type { EventEmitter2 } from 'eventemitter2'
|
import type { EventEmitter2 } from 'eventemitter2'
|
||||||
|
|
||||||
const headlineTagRegex = /^h[1-6]$/gi
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Searches for the first headline tag and extracts its plain text content.
|
* Searches for the first headline tag and extracts its plain text content.
|
||||||
*/
|
*/
|
||||||
|
@ -22,40 +20,9 @@ export class ExtractFirstHeadlineNodeProcessor extends NodeProcessor {
|
||||||
}
|
}
|
||||||
|
|
||||||
process(nodes: Document): Document {
|
process(nodes: Document): Document {
|
||||||
Optional.ofNullable(this.checkNodesForHeadline(nodes.children))
|
Optional.ofNullable(extractFirstHeading(nodes))
|
||||||
.map((foundHeadlineNode) => this.extractInnerTextFromNode(foundHeadlineNode).trim())
|
|
||||||
.filter((text) => text !== '')
|
.filter((text) => text !== '')
|
||||||
.ifPresent((text) => this.eventEmitter.emit(ExtractFirstHeadlineNodeProcessor.EVENT_NAME, text))
|
.ifPresent((text) => this.eventEmitter.emit(ExtractFirstHeadlineNodeProcessor.EVENT_NAME, text))
|
||||||
return nodes
|
return nodes
|
||||||
}
|
}
|
||||||
|
|
||||||
private checkNodesForHeadline(nodes: Node[]): Node | undefined {
|
|
||||||
return nodes.find((node) => isTag(node) && node.name.match(headlineTagRegex))
|
|
||||||
}
|
|
||||||
|
|
||||||
private extractInnerTextFromNode(node: Node): string {
|
|
||||||
if (isText(node)) {
|
|
||||||
return node.nodeValue
|
|
||||||
} else if (isTag(node)) {
|
|
||||||
return this.extractInnerTextFromTag(node)
|
|
||||||
} else {
|
|
||||||
return ''
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private extractInnerTextFromTag(node: Element): string {
|
|
||||||
if (node.name === 'a' && this.findAttribute(node, 'class')?.value.includes('heading-anchor')) {
|
|
||||||
return ''
|
|
||||||
} else if (node.name === 'img') {
|
|
||||||
return this.findAttribute(node, 'alt')?.value ?? ''
|
|
||||||
} else {
|
|
||||||
return node.children.reduce((state, child) => {
|
|
||||||
return state + this.extractInnerTextFromNode(child)
|
|
||||||
}, '')
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private findAttribute(node: Element, attributeName: string) {
|
|
||||||
return node.attributes.find((attribute) => attribute.name === attributeName)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2323,6 +2323,7 @@ __metadata:
|
||||||
"@types/ws": 8.5.4
|
"@types/ws": 8.5.4
|
||||||
"@typescript-eslint/eslint-plugin": 5.58.0
|
"@typescript-eslint/eslint-plugin": 5.58.0
|
||||||
"@typescript-eslint/parser": 5.58.0
|
"@typescript-eslint/parser": 5.58.0
|
||||||
|
domhandler: 5.0.3
|
||||||
eslint: 8.38.0
|
eslint: 8.38.0
|
||||||
eslint-config-prettier: 8.8.0
|
eslint-config-prettier: 8.8.0
|
||||||
eslint-plugin-jest: 27.2.1
|
eslint-plugin-jest: 27.2.1
|
||||||
|
|
Loading…
Reference in a new issue