From 3962cafa5d6ad56b5eb1c6c2fc298577ced67e6f Mon Sep 17 00:00:00 2001
From: Tilman Vatteroth
Date: Sat, 8 Apr 2023 21:31:27 +0200
Subject: [PATCH] feat: move title extraction into commons package

Signed-off-by: Tilman Vatteroth
---
 commons/package.json                          |  1 +
 commons/src/index.ts                          |  2 +
 .../extract-first-heading.spec.ts             | 60 +++++++++++++++++++
 .../title-extraction/extract-first-heading.ts | 57 ++++++++++++++++++
 .../extract-first-headline-node-processor.ts  | 39 +-----------
 yarn.lock                                     |  1 +
 6 files changed, 124 insertions(+), 36 deletions(-)
 create mode 100644 commons/src/title-extraction/extract-first-heading.spec.ts
 create mode 100644 commons/src/title-extraction/extract-first-heading.ts

diff --git a/commons/package.json b/commons/package.json
index febdbe455..bda7d4d1b 100644
--- a/commons/package.json
+++ b/commons/package.json
@@ -38,6 +38,7 @@
     "url": "https://github.com/hedgedoc/hedgedoc.git"
   },
   "dependencies": {
+    "domhandler": "5.0.3",
     "eventemitter2": "6.4.9",
     "isomorphic-ws": "5.0.0",
     "reveal.js": "4.5.0",
diff --git a/commons/src/index.ts b/commons/src/index.ts
index b1aaf5873..acfad12cc 100644
--- a/commons/src/index.ts
+++ b/commons/src/index.ts
@@ -32,3 +32,5 @@ export * from './title-extraction/generate-note-title.js'
 export * from './title-extraction/types/iso6391.js'
 export * from './title-extraction/types/frontmatter.js'
 export * from './title-extraction/types/slide-show-options.js'
+
+export { extractFirstHeading } from './title-extraction/extract-first-heading.js'
diff --git a/commons/src/title-extraction/extract-first-heading.spec.ts b/commons/src/title-extraction/extract-first-heading.spec.ts
new file mode 100644
index 000000000..153e76002
--- /dev/null
+++ b/commons/src/title-extraction/extract-first-heading.spec.ts
@@ -0,0 +1,60 @@
+/*
+ * SPDX-FileCopyrightText: 2023 The HedgeDoc developers (see AUTHORS file)
+ *
+ * SPDX-License-Identifier: AGPL-3.0-only
+ */
+import { extractFirstHeading } from './extract-first-heading.js'
+import { describe, expect, it } from '@jest/globals'
+import { Document, Element, Text } from 'domhandler'
+
+describe('extract first heading', () => {
+  describe.each([1, 2, 3, 4, 5, 6])('h%d', (headlineIndex) => {
+    it('extracts plain text', () => {
+      const content = `headline${headlineIndex}`
+      const headline = new Element(`h${headlineIndex}`, {}, [new Text(content)])
+      const document = new Document([headline])
+      expect(extractFirstHeading(document)).toBe(content)
+    })
+
+    it("doesn't extract heading-anchor", () => {
+      const headline = new Element(`h${headlineIndex}`, {}, [
+        new Element('a', { class: 'class1 heading-anchor class2' }, [
+          new Text('invalid link content')
+        ])
+      ])
+      const document = new Document([headline])
+      expect(extractFirstHeading(document)).toBe('')
+    })
+
+    it('extracts nested texts', () => {
+      const headline = new Element(`h${headlineIndex}`, {}, [
+        new Element('a', {}, [
+          new Text('Valid'),
+          new Element('div', {}, [new Text('Text')]),
+          new Text(`${headlineIndex}`)
+        ])
+      ])
+      const document = new Document([headline])
+      expect(extractFirstHeading(document)).toBe(`ValidText${headlineIndex}`)
+    })
+
+    it('extracts image alt texts', () => {
+      const headline = new Element(`h${headlineIndex}`, {}, [
+        new Element('img', { alt: 'Image Alt' })
+      ])
+      const document = new Document([headline])
+      expect(extractFirstHeading(document)).toBe('Image Alt')
+    })
+
+    it('extracts only the first found headline', () => {
+      const headline1 = new Element(`h${headlineIndex}`, {}, [
+        new Text(`headline${headlineIndex}`)
+      ])
+      const headline2 = new Element(`h${headlineIndex}`, {}, [
+        new Text('headline1')
+      ])
+      const document = new Document([headline1, headline2])
+      expect(extractFirstHeading(document)).toBe(`headline${headlineIndex}`)
+    })
+  })
+})
diff --git a/commons/src/title-extraction/extract-first-heading.ts b/commons/src/title-extraction/extract-first-heading.ts
new file mode 100644
index 000000000..1cfece20c
--- /dev/null
+++ b/commons/src/title-extraction/extract-first-heading.ts
@@ -0,0 +1,57 @@
+/*
+ * SPDX-FileCopyrightText: 2023 The HedgeDoc developers (see AUTHORS file)
+ *
+ * SPDX-License-Identifier: AGPL-3.0-only
+ */
+import { Element, isTag, isText, Node, NodeWithChildren } from 'domhandler'
+
+const headlineTagRegex = /^h[1-6]$/i
+
+/**
+ * Extracts the text content of the first top level headline tag.
+ *
+ * @param nodes The node whose children should be checked for the headline
+ * @return the plain text representation of the first headline. {@code undefined} if no headline has been found.
+ */
+export function extractFirstHeading(
+  nodes: NodeWithChildren
+): string | undefined {
+  const foundHeadlineNode = checkNodesForHeadline(nodes.children)
+  if (!foundHeadlineNode) {
+    return
+  }
+  return extractInnerTextFromNode(foundHeadlineNode).trim()
+}
+
+function checkNodesForHeadline(nodes: Node[]): Node | undefined {
+  return nodes.find((node) => isTag(node) && headlineTagRegex.test(node.name))
+}
+
+function extractInnerTextFromNode(node: Node): string {
+  if (isText(node)) {
+    return node.nodeValue
+  } else if (isTag(node)) {
+    return extractInnerTextFromTag(node)
+  } else {
+    return ''
+  }
+}
+
+function extractInnerTextFromTag(node: Element): string {
+  if (
+    node.name === 'a' &&
+    findAttribute(node, 'class')?.value.split(/\s+/).includes('heading-anchor')
+  ) {
+    return ''
+  } else if (node.name === 'img') {
+    return findAttribute(node, 'alt')?.value ?? ''
+  } else {
+    return node.children.reduce((state, child) => {
+      return state + extractInnerTextFromNode(child)
+    }, '')
+  }
+}
+
+function findAttribute(node: Element, attributeName: string) {
+  return node.attributes.find((attribute) => attribute.name === attributeName)
+}
diff --git a/frontend/src/components/markdown-renderer/extensions/extract-first-headline/extract-first-headline-node-processor.ts b/frontend/src/components/markdown-renderer/extensions/extract-first-headline/extract-first-headline-node-processor.ts
index 00b5de6fe..382fc74b8 100644
--- a/frontend/src/components/markdown-renderer/extensions/extract-first-headline/extract-first-headline-node-processor.ts
+++ b/frontend/src/components/markdown-renderer/extensions/extract-first-headline/extract-first-headline-node-processor.ts
@@ -4,13 +4,11 @@
  * SPDX-License-Identifier: AGPL-3.0-only
  */
 import { NodeProcessor } from '../../node-preprocessors/node-processor'
+import { extractFirstHeading } from '@hedgedoc/commons'
 import { Optional } from '@mrdrogdrog/optional'
-import type { Document, Node, Element } from 'domhandler'
-import { isTag, isText } from 'domhandler'
+import type { Document } from 'domhandler'
 import type { EventEmitter2 } from 'eventemitter2'
 
-const headlineTagRegex = /^h[1-6]$/gi
-
 /**
  * Searches for the first headline tag and extracts its plain text content.
  */
@@ -22,40 +20,9 @@ export class ExtractFirstHeadlineNodeProcessor extends NodeProcessor {
   }
 
   process(nodes: Document): Document {
-    Optional.ofNullable(this.checkNodesForHeadline(nodes.children))
-      .map((foundHeadlineNode) => this.extractInnerTextFromNode(foundHeadlineNode).trim())
+    Optional.ofNullable(extractFirstHeading(nodes))
       .filter((text) => text !== '')
       .ifPresent((text) => this.eventEmitter.emit(ExtractFirstHeadlineNodeProcessor.EVENT_NAME, text))
     return nodes
   }
-
-  private checkNodesForHeadline(nodes: Node[]): Node | undefined {
-    return nodes.find((node) => isTag(node) && node.name.match(headlineTagRegex))
-  }
-
-  private extractInnerTextFromNode(node: Node): string {
-    if (isText(node)) {
-      return node.nodeValue
-    } else if (isTag(node)) {
-      return this.extractInnerTextFromTag(node)
-    } else {
-      return ''
-    }
-  }
-
-  private extractInnerTextFromTag(node: Element): string {
-    if (node.name === 'a' && this.findAttribute(node, 'class')?.value.includes('heading-anchor')) {
-      return ''
-    } else if (node.name === 'img') {
-      return this.findAttribute(node, 'alt')?.value ?? ''
-    } else {
-      return node.children.reduce((state, child) => {
-        return state + this.extractInnerTextFromNode(child)
-      }, '')
-    }
-  }
-
-  private findAttribute(node: Element, attributeName: string) {
-    return node.attributes.find((attribute) => attribute.name === attributeName)
-  }
 }
diff --git a/yarn.lock b/yarn.lock
index e3b68e955..6eb463ec5 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -2323,6 +2323,7 @@ __metadata:
     "@types/ws": 8.5.4
     "@typescript-eslint/eslint-plugin": 5.58.0
     "@typescript-eslint/parser": 5.58.0
+    domhandler: 5.0.3
     eslint: 8.38.0
     eslint-config-prettier: 8.8.0
     eslint-plugin-jest: 27.2.1