feat: move title extraction into commons package

Signed-off-by: Tilman Vatteroth <git@tilmanvatteroth.de>
2024-11-24 02:36:31 -05:00 · 2023-04-08 21:31:27 +02:00 · 2023-04-08 21:31:27 +02:00 · 3962cafa5d
commit 3962cafa5d
parent 8de8a50bec
6 changed files with 124 additions and 36 deletions
--- a/commons/package.json
+++ b/commons/package.json
@ -38,6 +38,7 @@
    "url": "https://github.com/hedgedoc/hedgedoc.git"
  },
  "dependencies": {
    "domhandler": "5.0.3",
    "eventemitter2": "6.4.9",
    "isomorphic-ws": "5.0.0",
    "reveal.js": "4.5.0",
--- a/commons/src/index.ts
+++ b/commons/src/index.ts
@ -32,3 +32,5 @@ export * from './title-extraction/generate-note-title.js'
 export * from './title-extraction/types/iso6391.js'
 export * from './title-extraction/types/frontmatter.js'
 export * from './title-extraction/types/slide-show-options.js'
 export { extractFirstHeading } from './title-extraction/extract-first-heading.js'
--- a/commons/src/title-extraction/extract-first-heading.spec.ts
+++ b/commons/src/title-extraction/extract-first-heading.spec.ts
@ -0,0 +1,60 @@
 /*
 * SPDX-FileCopyrightText: 2023 The HedgeDoc developers (see AUTHORS file)
 *
 * SPDX-License-Identifier: AGPL-3.0-only
 */
 import { extractFirstHeading } from './extract-first-heading.js'
 import { describe, expect, it } from '@jest/globals'
 import { Document, Element, Text } from 'domhandler'
 describe('extract first heading', () => {
  // All headline levels are expected to behave identically, so every case runs for h1 to h6.
  describe.each([1, 2, 3, 4, 5, 6])('h%d', (headlineIndex) => {
    it('extracts plain text', () => {
      const content = `headline${headlineIndex}`
      const headline = new Element(`h${headlineIndex}`, {}, [new Text(content)])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe(content)
    })
    it("doesn't extract heading-anchor", () => {
      // The anchor class is surrounded by other classes to check that it is found within a class list.
      const headline = new Element(`h${headlineIndex}`, {}, [
        new Element('a', { class: 'class1 heading-anchor class2' }, [
          new Text('invalid link content')
        ])
      ])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe('')
    })
    it('extracts nested texts', () => {
      const headline = new Element(`h${headlineIndex}`, {}, [
        new Element('a', {}, [
          new Text('Valid'),
          new Element('div', {}, [new Text('Text')]),
          new Text(`${headlineIndex}`)
        ])
      ])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe(`ValidText${headlineIndex}`)
    })
    it('extracts image alt texts', () => {
      const headline = new Element(`h${headlineIndex}`, {}, [
        new Element('img', { alt: 'Image Alt' })
      ])
      const document = new Document([headline])
      expect(extractFirstHeading(document)).toBe('Image Alt')
    })
    it('extracts only the first found headline', () => {
      const headline1 = new Element(`h${headlineIndex}`, {}, [
        new Text(`headline${headlineIndex}`)
      ])
      // The second headline must never share its text with the first one (not even for h1),
      // otherwise the test couldn't tell which of the two headlines has been extracted.
      const headline2 = new Element(`h${headlineIndex}`, {}, [
        new Text('other headline')
      ])
      const document = new Document([headline1, headline2])
      expect(extractFirstHeading(document)).toBe(`headline${headlineIndex}`)
    })
  })
  it('returns undefined if there is no headline', () => {
    const paragraph = new Element('p', {}, [new Text('no headline')])
    const document = new Document([paragraph])
    expect(extractFirstHeading(document)).toBeUndefined()
  })
 })
--- a/commons/src/title-extraction/extract-first-heading.ts
+++ b/commons/src/title-extraction/extract-first-heading.ts
@ -0,0 +1,57 @@
 /*
 * SPDX-FileCopyrightText: 2023 The HedgeDoc developers (see AUTHORS file)
 *
 * SPDX-License-Identifier: AGPL-3.0-only
 */
 import { Element, isTag, isText, Node, NodeWithChildren } from 'domhandler'
 /**
 * Matches the tag names of headlines (h1 to h6) regardless of their case.
 *
 * The regex deliberately has no global flag: a global regex keeps its position
 * in `lastIndex` between `test`/`exec` calls, which makes a shared instance
 * return alternating results for the same input.
 */
 const headlineTagRegex = /^h[1-6]$/i
 /**
 * Finds the first headline among the direct children of the given node and returns its text content.
 *
 * Only the direct children are inspected. Headlines that are nested deeper in the tree are not considered.
 *
 * @param nodes The parent node whose direct children are searched for a headline
 * @returns The trimmed plain text of the first headline, or `undefined` if none of the children is a headline
 */
 export function extractFirstHeading(
  nodes: NodeWithChildren
 ): string | undefined {
  const firstHeadline = checkNodesForHeadline(nodes.children)
  return firstHeadline === undefined
    ? undefined
    : extractInnerTextFromNode(firstHeadline).trim()
 }
 /**
 * Looks for the first element in the given list whose tag name is a headline tag.
 *
 * @param nodes The nodes to check. Their children are not inspected.
 * @returns The first headline element, or `undefined` if the list contains none
 */
 function checkNodesForHeadline(nodes: Node[]): Node | undefined {
  for (const candidate of nodes) {
    if (isTag(candidate) && candidate.name.match(headlineTagRegex) !== null) {
      return candidate
    }
  }
  return undefined
 }
 /**
 * Converts a node into its plain text representation.
 *
 * @param node The node to convert
 * @returns The value of a text node, the extracted text of an element, or an empty string for any other node
 */
 function extractInnerTextFromNode(node: Node): string {
  if (isText(node)) {
    return node.nodeValue
  }
  return isTag(node) ? extractInnerTextFromTag(node) : ''
 }
 /**
 * Converts an element into its plain text representation.
 *
 * - Links that carry the `heading-anchor` class contribute no text.
 * - Images contribute the value of their `alt` attribute, or nothing if it is missing.
 * - Every other element contributes the concatenated text of its children.
 *
 * @param node The element to convert
 * @returns The plain text representation of the element
 */
 function extractInnerTextFromTag(node: Element): string {
  if (node.name === 'a') {
    // Class names may be separated by any ASCII whitespace (tab, line break, …), not only by single spaces.
    const classNames =
      findAttribute(node, 'class')?.value.split(/[ \t\n\f\r]+/) ?? []
    if (classNames.includes('heading-anchor')) {
      return ''
    }
  }
  if (node.name === 'img') {
    return findAttribute(node, 'alt')?.value ?? ''
  }
  return node.children.reduce(
    (text, child) => text + extractInnerTextFromNode(child),
    ''
  )
 }
 /**
 * Looks up an attribute of an element by its name.
 *
 * @param node The element whose attributes are searched
 * @param attributeName The exact name of the wanted attribute
 * @returns The first attribute with the given name, or `undefined` if the element has no such attribute
 */
 function findAttribute(node: Element, attributeName: string) {
  return node.attributes.find(({ name }) => name === attributeName)
 }
--- a/frontend/src/components/markdown-renderer/extensions/extract-first-headline/extract-first-headline-node-processor.ts
+++ b/frontend/src/components/markdown-renderer/extensions/extract-first-headline/extract-first-headline-node-processor.ts
@ -4,13 +4,11 @@
 * SPDX-License-Identifier: AGPL-3.0-only
 */
 import { NodeProcessor } from '../../node-preprocessors/node-processor'
 import { extractFirstHeading } from '@hedgedoc/commons'
 import { Optional } from '@mrdrogdrog/optional'
-import type { Document, Node, Element } from 'domhandler'
+import type { Document } from 'domhandler'
 import { isTag, isText } from 'domhandler'
 import type { EventEmitter2 } from 'eventemitter2'
 const headlineTagRegex = /^h[1-6]$/gi
 /**
 * Searches for the first headline tag and extracts its plain text content.
 */
@ -22,40 +20,9 @@ export class ExtractFirstHeadlineNodeProcessor extends NodeProcessor {
  }
  process(nodes: Document): Document {
-    Optional.ofNullable(this.checkNodesForHeadline(nodes.children))
+    Optional.ofNullable(extractFirstHeading(nodes))
      .map((foundHeadlineNode) => this.extractInnerTextFromNode(foundHeadlineNode).trim())
      .filter((text) => text !== '')
      .ifPresent((text) => this.eventEmitter.emit(ExtractFirstHeadlineNodeProcessor.EVENT_NAME, text))
    return nodes
  }
  /**
   * Looks for the first element in the given list whose tag name is a headline tag.
   *
   * @param nodes The nodes to check. Their children are not inspected.
   * @returns The first headline element, or `undefined` if the list contains none
   */
  private checkNodesForHeadline(nodes: Node[]): Node | undefined {
    for (const candidate of nodes) {
      if (isTag(candidate) && candidate.name.match(headlineTagRegex) !== null) {
        return candidate
      }
    }
    return undefined
  }
  /**
   * Converts a node into its plain text representation.
   *
   * @param node The node to convert
   * @returns The value of a text node, the extracted text of an element, or an empty string for any other node
   */
  private extractInnerTextFromNode(node: Node): string {
    if (isText(node)) {
      return node.nodeValue
    }
    return isTag(node) ? this.extractInnerTextFromTag(node) : ''
  }
  /**
   * Converts an element into its plain text representation.
   *
   * - Links that carry the `heading-anchor` class contribute no text.
   * - Images contribute the value of their `alt` attribute, or nothing if it is missing.
   * - Every other element contributes the concatenated text of its children.
   *
   * @param node The element to convert
   * @returns The plain text representation of the element
   */
  private extractInnerTextFromTag(node: Element): string {
    if (node.name === 'a') {
      // Compare whole class names: a substring check on the raw attribute value would also
      // match unrelated classes that merely contain "heading-anchor" (e.g. "my-heading-anchor-x").
      const classNames = this.findAttribute(node, 'class')?.value.split(/[ \t\n\f\r]+/) ?? []
      if (classNames.includes('heading-anchor')) {
        return ''
      }
    }
    if (node.name === 'img') {
      return this.findAttribute(node, 'alt')?.value ?? ''
    }
    return node.children.reduce((text, child) => text + this.extractInnerTextFromNode(child), '')
  }
  /**
   * Looks up an attribute of an element by its name.
   *
   * @param node The element whose attributes are searched
   * @param attributeName The exact name of the wanted attribute
   * @returns The first attribute with the given name, or `undefined` if the element has no such attribute
   */
  private findAttribute(node: Element, attributeName: string) {
    return node.attributes.find(({ name }) => name === attributeName)
  }
 }
--- a/yarn.lock
+++ b/yarn.lock
@ -2323,6 +2323,7 @@ __metadata:
    "@types/ws": 8.5.4
    "@typescript-eslint/eslint-plugin": 5.58.0
    "@typescript-eslint/parser": 5.58.0
    domhandler: 5.0.3
    eslint: 8.38.0
    eslint-config-prettier: 8.8.0
    eslint-plugin-jest: 27.2.1