From 59fb97d874873a867c6df9c3574e8878322dd03d Mon Sep 17 00:00:00 2001 From: Eric Mc Sween <5454374+emcsween@users.noreply.github.com> Date: Tue, 25 Feb 2025 07:40:10 -0500 Subject: [PATCH] Merge pull request #23819 from overleaf/em-find-projects Rename find_dangling_comments script to check_docs and make improvements GitOrigin-RevId: 3c81c734f4259c5939c190a886b5b90239d03017 --- package-lock.json | 106 +++++++-- services/web/.eslintrc.js | 9 +- services/web/package.json | 1 + services/web/scripts/check_docs.mjs | 221 ++++++++++++++++++ .../web/scripts/find_dangling_comments.mjs | 164 ------------- 5 files changed, 313 insertions(+), 188 deletions(-) create mode 100644 services/web/scripts/check_docs.mjs delete mode 100644 services/web/scripts/find_dangling_comments.mjs diff --git a/package-lock.json b/package-lock.json index 358dc170fd..ac28be494c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -31233,16 +31233,16 @@ } }, "node_modules/p-queue": { - "version": "7.4.1", - "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-7.4.1.tgz", - "integrity": "sha512-vRpMXmIkYF2/1hLBKisKeVYJZ8S2tZ0zEAmIJgdVKP2nq0nh4qCdf8bgw+ZgKrkh71AOCaqzwbJJk1WtdcF3VA==", - "dev": true, + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-8.1.0.tgz", + "integrity": "sha512-mxLDbbGIBEXTJL0zEx8JIylaj3xQ7Z/7eEVjcF9fJX4DBiH9oqe+oahYnlKKxm0Ci9TlWTyhSHgygxMxjIB2jw==", + "license": "MIT", "dependencies": { "eventemitter3": "^5.0.1", - "p-timeout": "^5.0.2" + "p-timeout": "^6.1.2" }, "engines": { - "node": ">=12" + "node": ">=18" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -31252,15 +31252,15 @@ "version": "5.0.1", "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz", "integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==", - "dev": true + "license": "MIT" }, "node_modules/p-queue/node_modules/p-timeout": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.1.0.tgz", - "integrity": "sha512-auFDyzzzGZZZdHz3BtET9VEz0SE/uMEAx7uWfGPucfzEwwe/xH0iVeZibQmANYE/hp9T2+UUZT5m+BKyrDp3Ew==", - "dev": true, + "version": "6.1.4", + "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-6.1.4.tgz", + "integrity": "sha512-MyIV3ZA/PmyBN/ud8vV9XzwTrNtR4jFrObymZYnZqMmW0zA8Z17vnT0rBgFE/TlohB+YCHqXMgZzb3Csp49vqg==", + "license": "MIT", "engines": { - "node": ">=12" + "node": ">=14.16" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -44078,6 +44078,7 @@ "otplib": "^12.0.1", "p-limit": "^2.3.0", "p-props": "4.0.0", + "p-queue": "^8.1.0", "parse-data-url": "^2.0.0", "passport": "^0.6.0", "passport-google-oauth20": "^2.0.0", @@ -44767,6 +44768,43 @@ "@uppy/core": "^3.8.0" } }, + "services/web/node_modules/@uppy/provider-views/node_modules/eventemitter3": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz", + "integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==", + "dev": true, + "license": "MIT" + }, + "services/web/node_modules/@uppy/provider-views/node_modules/p-queue": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-7.4.1.tgz", + "integrity": "sha512-vRpMXmIkYF2/1hLBKisKeVYJZ8S2tZ0zEAmIJgdVKP2nq0nh4qCdf8bgw+ZgKrkh71AOCaqzwbJJk1WtdcF3VA==", + "dev": true, + "license": "MIT", + "dependencies": { + "eventemitter3": "^5.0.1", + "p-timeout": "^5.0.2" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "services/web/node_modules/@uppy/provider-views/node_modules/p-timeout": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.1.0.tgz", + "integrity": "sha512-auFDyzzzGZZZdHz3BtET9VEz0SE/uMEAx7uWfGPucfzEwwe/xH0iVeZibQmANYE/hp9T2+UUZT5m+BKyrDp3Ew==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "services/web/node_modules/@uppy/react": { "version": "3.2.1", "resolved": "https://registry.npmjs.org/@uppy/react/-/react-3.2.1.tgz", @@ -53118,6 +53156,7 @@ "overleaf-editor-core": "*", "p-limit": "^2.3.0", "p-props": "4.0.0", + "p-queue": "^8.1.0", "parse-data-url": "^2.0.0", "passport": "^0.6.0", "passport-google-oauth20": "^2.0.0", @@ -53574,6 +53613,30 @@ "nanoid": "^4.0.0", "p-queue": "^7.3.4", "preact": "^10.5.13" + }, + "dependencies": { + "eventemitter3": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz", + "integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==", + "dev": true + }, + "p-queue": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-7.4.1.tgz", + "integrity": "sha512-vRpMXmIkYF2/1hLBKisKeVYJZ8S2tZ0zEAmIJgdVKP2nq0nh4qCdf8bgw+ZgKrkh71AOCaqzwbJJk1WtdcF3VA==", + "dev": true, + "requires": { + "eventemitter3": "^5.0.1", + "p-timeout": "^5.0.2" + } + }, + "p-timeout": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.1.0.tgz", + "integrity": "sha512-auFDyzzzGZZZdHz3BtET9VEz0SE/uMEAx7uWfGPucfzEwwe/xH0iVeZibQmANYE/hp9T2+UUZT5m+BKyrDp3Ew==", + "dev": true + } } }, "@uppy/react": { @@ -72418,26 +72481,23 @@ } }, "p-queue": { - "version": "7.4.1", - "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-7.4.1.tgz", - "integrity": "sha512-vRpMXmIkYF2/1hLBKisKeVYJZ8S2tZ0zEAmIJgdVKP2nq0nh4qCdf8bgw+ZgKrkh71AOCaqzwbJJk1WtdcF3VA==", - "dev": true, + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-8.1.0.tgz", + "integrity": "sha512-mxLDbbGIBEXTJL0zEx8JIylaj3xQ7Z/7eEVjcF9fJX4DBiH9oqe+oahYnlKKxm0Ci9TlWTyhSHgygxMxjIB2jw==", "requires": { "eventemitter3": "^5.0.1", - "p-timeout": "^5.0.2" + "p-timeout": "^6.1.2" }, "dependencies": { "eventemitter3": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz", - "integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==", - "dev": true + "integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==" }, "p-timeout": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.1.0.tgz", - "integrity": "sha512-auFDyzzzGZZZdHz3BtET9VEz0SE/uMEAx7uWfGPucfzEwwe/xH0iVeZibQmANYE/hp9T2+UUZT5m+BKyrDp3Ew==", - "dev": true + "version": "6.1.4", + "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-6.1.4.tgz", + "integrity": "sha512-MyIV3ZA/PmyBN/ud8vV9XzwTrNtR4jFrObymZYnZqMmW0zA8Z17vnT0rBgFE/TlohB+YCHqXMgZzb3Csp49vqg==" } } }, diff --git a/services/web/.eslintrc.js b/services/web/.eslintrc.js index ee585fcd8b..47d15bca87 100644 --- a/services/web/.eslintrc.js +++ b/services/web/.eslintrc.js @@ -107,7 +107,14 @@ module.exports = { }, plugins: ['unicorn'], rules: { - 'import/no-unresolved': 'error', + 'import/no-unresolved': [ + 'error', + { + // eslint-plugin-import does not support exports directive in package.json + // https://github.com/import-js/eslint-plugin-import/issues/1810 + ignore: ['^p-queue$'], + }, + ], 'import/extensions': [ 'error', 'ignorePackages', diff --git a/services/web/package.json b/services/web/package.json index f943edb915..8b8cdb2841 100644 --- a/services/web/package.json +++ b/services/web/package.json @@ -146,6 +146,7 @@ "otplib": "^12.0.1", "p-limit": "^2.3.0", "p-props": "4.0.0", + "p-queue": "^8.1.0", "parse-data-url": "^2.0.0", "passport": "^0.6.0", "passport-google-oauth20": "^2.0.0", diff --git a/services/web/scripts/check_docs.mjs b/services/web/scripts/check_docs.mjs new file mode 100644 index 0000000000..d6fccd5db3 --- /dev/null +++ b/services/web/scripts/check_docs.mjs @@ -0,0 +1,221 @@ +// @ts-check + +import minimist from 'minimist' +import PQueue from 'p-queue' +import { + db, + ObjectId, + READ_PREFERENCE_SECONDARY, +} from '../app/src/infrastructure/mongodb.js' +import DocstoreManager from '../app/src/Features/Docstore/DocstoreManager.js' +import { NotFoundError } from '../app/src/Features/Errors/Errors.js' + +const OPTS = parseArgs() + +function parseArgs() { + const args = minimist(process.argv.slice(2), { + string: ['min-project-id', 'max-project-id', 'project-modified-since'], + boolean: ['help', 'dangling-comments', 'tracked-changes'], + }) + + if (args.help) { + usage() + process.exit(0) + } + + const danglingComments = Boolean(args['dangling-comments']) + const trackedChanges = Boolean(args['tracked-changes']) + if (!danglingComments && !trackedChanges) { + console.log( + 'At least one of --dangling-comments or --tracked-changes must be enabled' + ) + process.exit(1) + } + + return { + minProjectId: args['min-project-id'] ?? null, + maxProjectId: args['max-project-id'] ?? null, + projectModifiedSince: args['project-modified-since'] + ? new Date(args['project-modified-since']) + : null, + danglingComments, + trackedChanges, + concurrency: parseInt(args.concurrency ?? '1', 10), + } +} + +function usage() { + console.log(`Usage: find_dangling_comments.mjs [OPTS] + +Options: + + --min-project-id Start scanning at this project id + --max-project-id Stop scanning at this project id + --project-modified-since Only consider projects that were modified after the given date + Example: 2020-01-01 + --dangling-comments Report projects with dangling comments + --tracked-changes Report projects with tracked changes + --concurrency How many projects can be processed in parallel + `) +} + +async function main() { + const queue = new PQueue({ concurrency: OPTS.concurrency }) + let projectsProcessed = 0 + let danglingCommentsFound = 0 + let trackedChangesFound = 0 + for await (const projectId of getProjectIds()) { + await queue.onEmpty() + queue.add(async () => { + const docs = await getDocs(projectId) + + if (OPTS.danglingComments) { + const danglingThreadIds = await findDanglingThreadIds(projectId, docs) + if (danglingThreadIds.length > 0) { + console.log( + `Project ${projectId} has dangling threads: ${danglingThreadIds.join(', ')}` + ) + danglingCommentsFound += 1 + } + } + + if (OPTS.trackedChanges) { + if (docsHaveTrackedChanges(docs)) { + console.log(`Project ${projectId} has tracked changes`) + trackedChangesFound += 1 + } + } + + projectsProcessed += 1 + if (projectsProcessed % 100000 === 0) { + console.log( + `${projectsProcessed} projects processed - Last project: ${projectId}` + ) + } + }) + } + await queue.onIdle() + + if (OPTS.danglingComments) { + console.log( + `${danglingCommentsFound} projects with dangling comments found` + ) + } + + if (OPTS.trackedChanges) { + console.log(`${trackedChangesFound} projects with tracked changes found`) + } +} + +function getProjectIds() { + const clauses = [] + + if (OPTS.minProjectId != null) { + clauses.push({ _id: { $gte: new ObjectId(OPTS.minProjectId) } }) + } + + if (OPTS.maxProjectId != null) { + clauses.push({ _id: { $lte: new ObjectId(OPTS.maxProjectId) } }) + } + + if (OPTS.projectModifiedSince) { + clauses.push({ lastUpdated: { $gte: OPTS.projectModifiedSince } }) + } + + const query = clauses.length > 0 ? { $and: clauses } : {} + return db.projects + .find(query, { + projection: { _id: 1 }, + readPreference: READ_PREFERENCE_SECONDARY, + sort: { _id: 1 }, + }) + .map(x => x._id.toString()) +} + +async function getDocs(projectId) { + const mongoDocs = db.docs.find( + { + project_id: new ObjectId(projectId), + deleted: { $ne: true }, + }, + { + projection: { ranges: 1, inS3: 1 }, + readPreference: READ_PREFERENCE_SECONDARY, + } + ) + + const docs = [] + for await (const mongoDoc of mongoDocs) { + if (mongoDoc.inS3) { + try { + const archivedDoc = await DocstoreManager.promises.getDoc( + projectId, + mongoDoc._id, + { peek: true } + ) + docs.push({ + id: mongoDoc._id.toString(), + ranges: archivedDoc.ranges, + }) + } catch (err) { + if (err instanceof NotFoundError) { + console.warn(`Doc ${mongoDoc._id} in project ${projectId} not found`) + } else { + throw err + } + } + } else { + docs.push({ + id: mongoDoc._id.toString(), + ranges: mongoDoc.ranges, + }) + } + } + + return docs +} + +async function findDanglingThreadIds(projectId, docs) { + const threadIds = new Set() + for (const doc of docs) { + const comments = doc.ranges?.comments ?? [] + for (const comment of comments) { + threadIds.add(comment.op.t.toString()) + } + } + + if (threadIds.size === 0) { + return [] + } + + const rooms = await db.rooms.find( + { project_id: new ObjectId(projectId), thread_id: { $exists: true } }, + { readPreference: READ_PREFERENCE_SECONDARY } + ) + for await (const room of rooms) { + threadIds.delete(room.thread_id.toString()) + if (threadIds.size === 0) { + break + } + } + + return Array.from(threadIds) +} + +function docsHaveTrackedChanges(docs) { + for (const doc of docs) { + const changes = doc.ranges?.changes ?? [] + if (changes.length > 0) { + return true + } + } + return false +} + +try { + await main() + process.exit(0) +} catch (err) { + console.error(err) + process.exit(1) +} diff --git a/services/web/scripts/find_dangling_comments.mjs b/services/web/scripts/find_dangling_comments.mjs deleted file mode 100644 index 518def93ed..0000000000 --- a/services/web/scripts/find_dangling_comments.mjs +++ /dev/null @@ -1,164 +0,0 @@ -// @ts-check - -import minimist from 'minimist' -import { - db, - ObjectId, - READ_PREFERENCE_SECONDARY, -} from '../app/src/infrastructure/mongodb.js' -import DocstoreManager from '../app/src/Features/Docstore/DocstoreManager.js' -import { NotFoundError } from '../app/src/Features/Errors/Errors.js' - -const OPTS = parseArgs() - -function parseArgs() { - const args = minimist(process.argv.slice(2), { - string: ['min-project-id', 'max-project-id', 'project-modified-since'], - boolean: ['help'], - }) - - if (args.help) { - usage() - process.exit(0) - } - - return { - minProjectId: args['min-project-id'] ?? null, - maxProjectId: args['max-project-id'] ?? null, - projectModifiedSince: args['project-modified-since'] - ? new Date(args['project-modified-since']) - : null, - } -} - -function usage() { - console.log(`Usage: find_dangling_comments.mjs [OPTS] - -Options: - - --min-project-id Start scanning at this project id - --max-project-id Stop scanning at this project id - --project-modified-since Only consider projects that were modified after the given date - Example: 2020-01-01`) -} - -async function main() { - let projectsProcessed = 0 - let projectsFound = 0 - for await (const projectId of fetchProjectIds()) { - projectsProcessed += 1 - const threadIds = await fetchThreadIds(projectId) - const danglingThreadIds = await findDanglingThreadIds(projectId, threadIds) - if (danglingThreadIds.length > 0) { - console.log( - `Project ${projectId} has dangling threads: ${danglingThreadIds.join(', ')}` - ) - projectsFound += 1 - } - if (projectsProcessed % 1000 === 0) { - console.log( - `${projectsProcessed} projects processed - Last project: ${projectId}` - ) - } - } - console.log(`${projectsFound} projects with dangling comments found`) -} - -function fetchProjectIds() { - const clauses = [] - - if (OPTS.minProjectId != null) { - clauses.push({ project_id: { $gte: new ObjectId(OPTS.minProjectId) } }) - } - - if (OPTS.maxProjectId != null) { - clauses.push({ project_id: { $lte: new ObjectId(OPTS.maxProjectId) } }) - } - - if (OPTS.projectModifiedSince) { - clauses.push({ lastUpdated: { $gte: OPTS.projectModifiedSince } }) - } - - const query = clauses.length > 0 ? { $and: clauses } : {} - return db.projects - .find(query, { - projection: { _id: 1 }, - readPreference: READ_PREFERENCE_SECONDARY, - }) - .map(x => x._id.toString()) -} - -async function fetchThreadIds(projectId) { - const docs = db.docs.find( - { - project_id: new ObjectId(projectId), - deleted: { $ne: true }, - $or: [{ 'ranges.comments.0': { $exists: true } }, { inS3: true }], - }, - { - projection: { 'ranges.comments': 1, inS3: 1 }, - readPreference: READ_PREFERENCE_SECONDARY, - } - ) - - const threadIds = new Set() - for await (const doc of docs) { - let comments = [] - if (doc.inS3) { - try { - const archivedDoc = await DocstoreManager.promises.getDoc( - projectId, - doc._id, - { peek: true } - ) - comments = archivedDoc.ranges?.comments ?? [] - } catch (err) { - if (err instanceof NotFoundError) { - console.warn(`Doc ${doc._id} in project ${projectId} not found`) - } else { - throw err - } - } - } else { - comments = doc.ranges?.comments - } - - for (const comment of comments) { - threadIds.add(comment.op.t.toString()) - } - } - - return threadIds -} - -/** - * @param {string} projectId - * @param {Set} threadIds - */ -async function findDanglingThreadIds(projectId, threadIds) { - const rooms = await db.rooms.find( - { project_id: projectId, thread_id: { $exists: true } }, - { readPreference: READ_PREFERENCE_SECONDARY } - ) - const existingThreadIds = new Set() - for await (const room of rooms) { - existingThreadIds.add(room.thread_id.toString()) - } - - const danglingThreadIds = [] - for (const threadId of threadIds) { - if (!existingThreadIds.has(threadId)) { - danglingThreadIds.push(threadId) - } - } - - return danglingThreadIds -} - -try { - await main() - process.exit(0) -} catch (err) { - console.error(err) - process.exit(1) -}