Merge pull request #23819 from overleaf/em-find-projects

Rename find_dangling_comments script to check_docs and make improvements

GitOrigin-RevId: 3c81c734f4259c5939c190a886b5b90239d03017
This commit is contained in:
Eric Mc Sween 2025-02-25 07:40:10 -05:00 committed by Copybot
parent 04c42f03e1
commit 59fb97d874
5 changed files with 313 additions and 188 deletions

106
package-lock.json generated
View file

@ -31233,16 +31233,16 @@
}
},
"node_modules/p-queue": {
"version": "7.4.1",
"resolved": "https://registry.npmjs.org/p-queue/-/p-queue-7.4.1.tgz",
"integrity": "sha512-vRpMXmIkYF2/1hLBKisKeVYJZ8S2tZ0zEAmIJgdVKP2nq0nh4qCdf8bgw+ZgKrkh71AOCaqzwbJJk1WtdcF3VA==",
"dev": true,
"version": "8.1.0",
"resolved": "https://registry.npmjs.org/p-queue/-/p-queue-8.1.0.tgz",
"integrity": "sha512-mxLDbbGIBEXTJL0zEx8JIylaj3xQ7Z/7eEVjcF9fJX4DBiH9oqe+oahYnlKKxm0Ci9TlWTyhSHgygxMxjIB2jw==",
"license": "MIT",
"dependencies": {
"eventemitter3": "^5.0.1",
"p-timeout": "^5.0.2"
"p-timeout": "^6.1.2"
},
"engines": {
"node": ">=12"
"node": ">=18"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
@ -31252,15 +31252,15 @@
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz",
"integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==",
"dev": true
"license": "MIT"
},
"node_modules/p-queue/node_modules/p-timeout": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.1.0.tgz",
"integrity": "sha512-auFDyzzzGZZZdHz3BtET9VEz0SE/uMEAx7uWfGPucfzEwwe/xH0iVeZibQmANYE/hp9T2+UUZT5m+BKyrDp3Ew==",
"dev": true,
"version": "6.1.4",
"resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-6.1.4.tgz",
"integrity": "sha512-MyIV3ZA/PmyBN/ud8vV9XzwTrNtR4jFrObymZYnZqMmW0zA8Z17vnT0rBgFE/TlohB+YCHqXMgZzb3Csp49vqg==",
"license": "MIT",
"engines": {
"node": ">=12"
"node": ">=14.16"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
@ -44078,6 +44078,7 @@
"otplib": "^12.0.1",
"p-limit": "^2.3.0",
"p-props": "4.0.0",
"p-queue": "^8.1.0",
"parse-data-url": "^2.0.0",
"passport": "^0.6.0",
"passport-google-oauth20": "^2.0.0",
@ -44767,6 +44768,43 @@
"@uppy/core": "^3.8.0"
}
},
"services/web/node_modules/@uppy/provider-views/node_modules/eventemitter3": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz",
"integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==",
"dev": true,
"license": "MIT"
},
"services/web/node_modules/@uppy/provider-views/node_modules/p-queue": {
"version": "7.4.1",
"resolved": "https://registry.npmjs.org/p-queue/-/p-queue-7.4.1.tgz",
"integrity": "sha512-vRpMXmIkYF2/1hLBKisKeVYJZ8S2tZ0zEAmIJgdVKP2nq0nh4qCdf8bgw+ZgKrkh71AOCaqzwbJJk1WtdcF3VA==",
"dev": true,
"license": "MIT",
"dependencies": {
"eventemitter3": "^5.0.1",
"p-timeout": "^5.0.2"
},
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"services/web/node_modules/@uppy/provider-views/node_modules/p-timeout": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.1.0.tgz",
"integrity": "sha512-auFDyzzzGZZZdHz3BtET9VEz0SE/uMEAx7uWfGPucfzEwwe/xH0iVeZibQmANYE/hp9T2+UUZT5m+BKyrDp3Ew==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"services/web/node_modules/@uppy/react": {
"version": "3.2.1",
"resolved": "https://registry.npmjs.org/@uppy/react/-/react-3.2.1.tgz",
@ -53118,6 +53156,7 @@
"overleaf-editor-core": "*",
"p-limit": "^2.3.0",
"p-props": "4.0.0",
"p-queue": "^8.1.0",
"parse-data-url": "^2.0.0",
"passport": "^0.6.0",
"passport-google-oauth20": "^2.0.0",
@ -53574,6 +53613,30 @@
"nanoid": "^4.0.0",
"p-queue": "^7.3.4",
"preact": "^10.5.13"
},
"dependencies": {
"eventemitter3": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz",
"integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==",
"dev": true
},
"p-queue": {
"version": "7.4.1",
"resolved": "https://registry.npmjs.org/p-queue/-/p-queue-7.4.1.tgz",
"integrity": "sha512-vRpMXmIkYF2/1hLBKisKeVYJZ8S2tZ0zEAmIJgdVKP2nq0nh4qCdf8bgw+ZgKrkh71AOCaqzwbJJk1WtdcF3VA==",
"dev": true,
"requires": {
"eventemitter3": "^5.0.1",
"p-timeout": "^5.0.2"
}
},
"p-timeout": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.1.0.tgz",
"integrity": "sha512-auFDyzzzGZZZdHz3BtET9VEz0SE/uMEAx7uWfGPucfzEwwe/xH0iVeZibQmANYE/hp9T2+UUZT5m+BKyrDp3Ew==",
"dev": true
}
}
},
"@uppy/react": {
@ -72418,26 +72481,23 @@
}
},
"p-queue": {
"version": "7.4.1",
"resolved": "https://registry.npmjs.org/p-queue/-/p-queue-7.4.1.tgz",
"integrity": "sha512-vRpMXmIkYF2/1hLBKisKeVYJZ8S2tZ0zEAmIJgdVKP2nq0nh4qCdf8bgw+ZgKrkh71AOCaqzwbJJk1WtdcF3VA==",
"dev": true,
"version": "8.1.0",
"resolved": "https://registry.npmjs.org/p-queue/-/p-queue-8.1.0.tgz",
"integrity": "sha512-mxLDbbGIBEXTJL0zEx8JIylaj3xQ7Z/7eEVjcF9fJX4DBiH9oqe+oahYnlKKxm0Ci9TlWTyhSHgygxMxjIB2jw==",
"requires": {
"eventemitter3": "^5.0.1",
"p-timeout": "^5.0.2"
"p-timeout": "^6.1.2"
},
"dependencies": {
"eventemitter3": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz",
"integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==",
"dev": true
"integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA=="
},
"p-timeout": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.1.0.tgz",
"integrity": "sha512-auFDyzzzGZZZdHz3BtET9VEz0SE/uMEAx7uWfGPucfzEwwe/xH0iVeZibQmANYE/hp9T2+UUZT5m+BKyrDp3Ew==",
"dev": true
"version": "6.1.4",
"resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-6.1.4.tgz",
"integrity": "sha512-MyIV3ZA/PmyBN/ud8vV9XzwTrNtR4jFrObymZYnZqMmW0zA8Z17vnT0rBgFE/TlohB+YCHqXMgZzb3Csp49vqg=="
}
}
},

View file

@ -107,7 +107,14 @@ module.exports = {
},
plugins: ['unicorn'],
rules: {
'import/no-unresolved': 'error',
'import/no-unresolved': [
'error',
{
// eslint-plugin-import does not support exports directive in package.json
// https://github.com/import-js/eslint-plugin-import/issues/1810
ignore: ['^p-queue$'],
},
],
'import/extensions': [
'error',
'ignorePackages',

View file

@ -146,6 +146,7 @@
"otplib": "^12.0.1",
"p-limit": "^2.3.0",
"p-props": "4.0.0",
"p-queue": "^8.1.0",
"parse-data-url": "^2.0.0",
"passport": "^0.6.0",
"passport-google-oauth20": "^2.0.0",

View file

@ -0,0 +1,221 @@
// @ts-check
import minimist from 'minimist'
import PQueue from 'p-queue'
import {
db,
ObjectId,
READ_PREFERENCE_SECONDARY,
} from '../app/src/infrastructure/mongodb.js'
import DocstoreManager from '../app/src/Features/Docstore/DocstoreManager.js'
import { NotFoundError } from '../app/src/Features/Errors/Errors.js'
// Command-line options, parsed once at module load; read by main() and getProjectIds().
const OPTS = parseArgs()
/**
 * Parses and validates the command-line arguments.
 *
 * Prints the usage text and exits with status 0 when --help is given. Exits
 * with status 1 (after printing a diagnostic on stderr) when no check is
 * enabled, when --project-modified-since cannot be parsed as a date, or when
 * --concurrency is not a positive integer.
 *
 * @returns {{
 *   minProjectId: string | null,
 *   maxProjectId: string | null,
 *   projectModifiedSince: Date | null,
 *   danglingComments: boolean,
 *   trackedChanges: boolean,
 *   concurrency: number,
 * }}
 */
function parseArgs() {
  const args = minimist(process.argv.slice(2), {
    string: ['min-project-id', 'max-project-id', 'project-modified-since'],
    boolean: ['help', 'dangling-comments', 'tracked-changes'],
  })

  if (args.help) {
    usage()
    process.exit(0)
  }

  const danglingComments = Boolean(args['dangling-comments'])
  const trackedChanges = Boolean(args['tracked-changes'])
  if (!danglingComments && !trackedChanges) {
    console.error(
      'At least one of --dangling-comments or --tracked-changes must be enabled'
    )
    process.exit(1)
  }

  const projectModifiedSince = args['project-modified-since']
    ? new Date(args['project-modified-since'])
    : null
  // new Date() never throws on bad input; it yields an Invalid Date whose
  // timestamp is NaN, which must not end up in the projects query.
  if (
    projectModifiedSince != null &&
    Number.isNaN(projectModifiedSince.getTime())
  ) {
    console.error(
      `Invalid --project-modified-since date: ${args['project-modified-since']}`
    )
    process.exit(1)
  }

  // minimist may hand back a number, a string or `true` for this flag, so
  // normalise to a string before parsing.
  const concurrency = Number.parseInt(String(args.concurrency ?? '1'), 10)
  if (!Number.isInteger(concurrency) || concurrency < 1) {
    console.error(
      `Invalid --concurrency value: ${args.concurrency} (expected a positive integer)`
    )
    process.exit(1)
  }

  return {
    minProjectId: args['min-project-id'] ?? null,
    maxProjectId: args['max-project-id'] ?? null,
    projectModifiedSince,
    danglingComments,
    trackedChanges,
    concurrency,
  }
}
/**
 * Prints the command-line help text to stdout.
 */
function usage() {
  console.log(`Usage: check_docs.mjs [OPTS]

Options:
    --min-project-id            Start scanning at this project id
    --max-project-id            Stop scanning at this project id
    --project-modified-since    Only consider projects that were modified after the given date
                                Example: 2020-01-01
    --dangling-comments         Report projects with dangling comments
    --tracked-changes           Report projects with tracked changes
    --concurrency               How many projects can be processed in parallel (default: 1)
    --help                      Show this help text
`)
}
/**
 * Scans the selected projects and reports those with dangling comment threads
 * and/or tracked changes, depending on which checks are enabled in OPTS.
 *
 * Projects are checked through a queue limited to OPTS.concurrency tasks. If a
 * project check fails, no further projects are scheduled, the tasks already
 * started are allowed to finish, and the first error is rethrown so that the
 * caller sees the failure instead of an unhandled promise rejection.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const queue = new PQueue({ concurrency: OPTS.concurrency })
  let projectsProcessed = 0
  let danglingCommentsFound = 0
  let trackedChangesFound = 0
  // Errors raised by queued tasks; only the first one is reported.
  const errors = []

  // Runs the enabled checks on a single project and updates the counters.
  const checkProject = async projectId => {
    const docs = await getDocs(projectId)
    if (OPTS.danglingComments) {
      const danglingThreadIds = await findDanglingThreadIds(projectId, docs)
      if (danglingThreadIds.length > 0) {
        console.log(
          `Project ${projectId} has dangling threads: ${danglingThreadIds.join(', ')}`
        )
        danglingCommentsFound += 1
      }
    }
    if (OPTS.trackedChanges && docsHaveTrackedChanges(docs)) {
      console.log(`Project ${projectId} has tracked changes`)
      trackedChangesFound += 1
    }
    projectsProcessed += 1
    if (projectsProcessed % 100000 === 0) {
      console.log(
        `${projectsProcessed} projects processed - Last project: ${projectId}`
      )
    }
  }

  for await (const projectId of getProjectIds()) {
    // Backpressure: do not pull more project ids from the cursor while a task
    // is still waiting for a free slot in the queue.
    await queue.onEmpty()
    if (errors.length > 0) {
      break
    }
    queue
      .add(() => checkProject(projectId))
      .catch(err => {
        errors.push(err)
      })
  }
  await queue.onIdle()
  if (errors.length > 0) {
    throw errors[0]
  }

  if (OPTS.danglingComments) {
    console.log(
      `${danglingCommentsFound} projects with dangling comments found`
    )
  }
  if (OPTS.trackedChanges) {
    console.log(`${trackedChangesFound} projects with tracked changes found`)
  }
}
/**
 * Builds the cursor over the ids of the projects selected by OPTS.
 *
 * The optional id bounds and modification date are combined with $and; with
 * no filter, every project matches. Results are read from a secondary and
 * sorted by ascending _id.
 *
 * @returns a cursor yielding project ids as strings
 */
function getProjectIds() {
  const { minProjectId, maxProjectId, projectModifiedSince } = OPTS
  const filters = []
  if (minProjectId != null) {
    filters.push({ _id: { $gte: new ObjectId(minProjectId) } })
  }
  if (maxProjectId != null) {
    filters.push({ _id: { $lte: new ObjectId(maxProjectId) } })
  }
  if (projectModifiedSince) {
    filters.push({ lastUpdated: { $gte: projectModifiedSince } })
  }

  const selector = filters.length === 0 ? {} : { $and: filters }
  const cursor = db.projects.find(selector, {
    projection: { _id: 1 },
    readPreference: READ_PREFERENCE_SECONDARY,
    sort: { _id: 1 },
  })
  return cursor.map(project => project._id.toString())
}
/**
 * Loads the ranges of every non-deleted doc of a project.
 *
 * Docs flagged inS3 are fetched through DocstoreManager with `peek: true`;
 * the others use the ranges found in Mongo. A doc flagged inS3 for which
 * DocstoreManager raises NotFoundError is skipped with a warning; any other
 * error is propagated.
 *
 * @param {string} projectId
 * @returns {Promise<Array<{ id: string, ranges: any }>>}
 */
async function getDocs(projectId) {
  const cursor = db.docs.find(
    { project_id: new ObjectId(projectId), deleted: { $ne: true } },
    {
      projection: { ranges: 1, inS3: 1 },
      readPreference: READ_PREFERENCE_SECONDARY,
    }
  )

  const collected = []
  for await (const mongoDoc of cursor) {
    if (!mongoDoc.inS3) {
      collected.push({ id: mongoDoc._id.toString(), ranges: mongoDoc.ranges })
      continue
    }

    try {
      const { ranges } = await DocstoreManager.promises.getDoc(
        projectId,
        mongoDoc._id,
        { peek: true }
      )
      collected.push({ id: mongoDoc._id.toString(), ranges })
    } catch (err) {
      if (!(err instanceof NotFoundError)) {
        throw err
      }
      console.warn(`Doc ${mongoDoc._id} in project ${projectId} not found`)
    }
  }
  return collected
}
/**
 * Finds the comment threads referenced by the given docs that have no
 * matching entry in the rooms collection of the project.
 *
 * @param {string} projectId
 * @param {Array<{ id: string, ranges: any }>} docs
 * @returns {Promise<string[]>} ids of the threads without a room
 */
async function findDanglingThreadIds(projectId, docs) {
  // Every thread id referenced by a comment, deduplicated.
  const unmatched = new Set(
    docs.flatMap(doc =>
      (doc.ranges?.comments ?? []).map(comment => comment.op.t.toString())
    )
  )
  if (unmatched.size === 0) {
    return []
  }

  const roomCursor = await db.rooms.find(
    { project_id: new ObjectId(projectId), thread_id: { $exists: true } },
    { readPreference: READ_PREFERENCE_SECONDARY }
  )
  // Cross off the threads that have a room; stop reading as soon as every
  // thread is accounted for.
  for await (const { thread_id: threadId } of roomCursor) {
    unmatched.delete(threadId.toString())
    if (unmatched.size === 0) {
      break
    }
  }
  return [...unmatched]
}
/**
 * Tells whether at least one of the docs has a non-empty list of tracked
 * changes in its ranges.
 *
 * @param {Array<{ id: string, ranges: any }>} docs
 * @returns {boolean}
 */
function docsHaveTrackedChanges(docs) {
  return docs.some(({ ranges }) => (ranges?.changes ?? []).length > 0)
}
// Script entry point: run main() and exit explicitly, with status 0 on
// success or status 1 after printing the error when main() fails.
let exitCode = 0
try {
  await main()
} catch (err) {
  console.error(err)
  exitCode = 1
}
process.exit(exitCode)

View file

@ -1,164 +0,0 @@
// @ts-check
import minimist from 'minimist'
import {
db,
ObjectId,
READ_PREFERENCE_SECONDARY,
} from '../app/src/infrastructure/mongodb.js'
import DocstoreManager from '../app/src/Features/Docstore/DocstoreManager.js'
import { NotFoundError } from '../app/src/Features/Errors/Errors.js'
// Command-line options, parsed once at module load; read by fetchProjectIds().
const OPTS = parseArgs()
/**
 * Parses the command-line arguments.
 *
 * When --help is given, prints the usage text and exits with status 0.
 *
 * @returns {{
 *   minProjectId: string | null,
 *   maxProjectId: string | null,
 *   projectModifiedSince: Date | null,
 * }}
 */
function parseArgs() {
  const argv = minimist(process.argv.slice(2), {
    string: ['min-project-id', 'max-project-id', 'project-modified-since'],
    boolean: ['help'],
  })

  if (argv.help) {
    usage()
    process.exit(0)
  }

  const {
    'min-project-id': minProjectId,
    'max-project-id': maxProjectId,
    'project-modified-since': modifiedSince,
  } = argv

  return {
    minProjectId: minProjectId ?? null,
    maxProjectId: maxProjectId ?? null,
    projectModifiedSince: modifiedSince ? new Date(modifiedSince) : null,
  }
}
/**
 * Prints the command-line help text to stdout.
 */
function usage() {
console.log(`Usage: find_dangling_comments.mjs [OPTS]
Options:
--min-project-id Start scanning at this project id
--max-project-id Stop scanning at this project id
--project-modified-since Only consider projects that were modified after the given date
Example: 2020-01-01`)
}
/**
 * Walks the selected projects one at a time and reports those whose comments
 * reference threads that cannot be found, logging progress every 1000
 * projects and a final count.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const counters = { processed: 0, found: 0 }

  for await (const projectId of fetchProjectIds()) {
    counters.processed += 1

    const referencedThreadIds = await fetchThreadIds(projectId)
    const dangling = await findDanglingThreadIds(projectId, referencedThreadIds)
    if (dangling.length > 0) {
      console.log(
        `Project ${projectId} has dangling threads: ${dangling.join(', ')}`
      )
      counters.found += 1
    }

    const isProgressCheckpoint = counters.processed % 1000 === 0
    if (isProgressCheckpoint) {
      console.log(
        `${counters.processed} projects processed - Last project: ${projectId}`
      )
    }
  }

  console.log(`${counters.found} projects with dangling comments found`)
}
/**
 * Builds the cursor over the ids of the projects selected by OPTS.
 *
 * The id bounds filter on `_id`, the field this query projects and maps to
 * strings. Results are sorted by ascending _id so that the "Last project"
 * progress line can be reused as --min-project-id to resume a scan.
 *
 * @returns a cursor yielding project ids as strings
 */
function fetchProjectIds() {
  const clauses = []
  if (OPTS.minProjectId != null) {
    clauses.push({ _id: { $gte: new ObjectId(OPTS.minProjectId) } })
  }
  if (OPTS.maxProjectId != null) {
    clauses.push({ _id: { $lte: new ObjectId(OPTS.maxProjectId) } })
  }
  if (OPTS.projectModifiedSince) {
    clauses.push({ lastUpdated: { $gte: OPTS.projectModifiedSince } })
  }
  const query = clauses.length > 0 ? { $and: clauses } : {}
  return db.projects
    .find(query, {
      projection: { _id: 1 },
      readPreference: READ_PREFERENCE_SECONDARY,
      sort: { _id: 1 },
    })
    .map(x => x._id.toString())
}
/**
 * Collects the ids of all threads referenced by comments in the non-deleted
 * docs of a project.
 *
 * Only docs that have at least one comment in Mongo or are flagged inS3 are
 * fetched. Docs flagged inS3 are read through DocstoreManager with
 * `peek: true`; when DocstoreManager raises NotFoundError the doc is skipped
 * with a warning, and any other error is propagated.
 *
 * @param {string} projectId
 * @returns {Promise<Set<string>>}
 */
async function fetchThreadIds(projectId) {
  const docs = db.docs.find(
    {
      project_id: new ObjectId(projectId),
      deleted: { $ne: true },
      $or: [{ 'ranges.comments.0': { $exists: true } }, { inS3: true }],
    },
    {
      projection: { 'ranges.comments': 1, inS3: 1 },
      readPreference: READ_PREFERENCE_SECONDARY,
    }
  )

  const threadIds = new Set()
  for await (const doc of docs) {
    let comments = []
    if (doc.inS3) {
      try {
        const archivedDoc = await DocstoreManager.promises.getDoc(
          projectId,
          doc._id,
          { peek: true }
        )
        comments = archivedDoc.ranges?.comments ?? []
      } catch (err) {
        if (err instanceof NotFoundError) {
          console.warn(`Doc ${doc._id} in project ${projectId} not found`)
        } else {
          throw err
        }
      }
    } else {
      // Same fallback as the archived branch, so a doc without comments is
      // skipped instead of making the loop below iterate over undefined.
      comments = doc.ranges?.comments ?? []
    }
    for (const comment of comments) {
      threadIds.add(comment.op.t.toString())
    }
  }
  return threadIds
}
/**
 * Returns the thread ids referenced by comments for which no room exists in
 * the project.
 *
 * @param {string} projectId
 * @param {Set<string>} threadIds - thread ids referenced by the project's comments
 * @returns {Promise<string[]>} the thread ids without a matching room
 */
async function findDanglingThreadIds(projectId, threadIds) {
  // Without any referenced thread, nothing can be dangling: skip the query.
  if (threadIds.size === 0) {
    return []
  }

  // The project id is converted to an ObjectId, as fetchThreadIds() does for
  // the docs collection. NOTE(review): this presumes rooms also store
  // project_id as an ObjectId, in which case the raw string used previously
  // could never match — confirm against the rooms schema.
  const rooms = db.rooms.find(
    { project_id: new ObjectId(projectId), thread_id: { $exists: true } },
    { readPreference: READ_PREFERENCE_SECONDARY }
  )

  const existingThreadIds = new Set()
  for await (const room of rooms) {
    existingThreadIds.add(room.thread_id.toString())
  }

  return [...threadIds].filter(threadId => !existingThreadIds.has(threadId))
}
// Script entry point: run main() and exit explicitly, with status 0 on
// success or status 1 after printing the error when main() fails.
let exitCode = 0
try {
  await main()
} catch (err) {
  console.error(err)
  exitCode = 1
}
process.exit(exitCode)