From 0120268c57b68c185f19d6a81ea546141dca0e80 Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Mon, 25 Apr 2022 10:18:05 +0100 Subject: [PATCH] Merge pull request #7579 from overleaf/jpa-refactor-orphaned-data-helper [web] de-duplicate logic for get a list of hard deleted project ids GitOrigin-RevId: daf2ff427e24f9ef9253e4bc9ff52f53196fd854 --- .../scripts/delete_orphaned_chat_threads.js | 105 +---------------- .../scripts/delete_orphaned_data_helper.js | 110 ++++++++++++++++++ .../delete_orphaned_docs_online_check.js | 101 ++-------------- 3 files changed, 125 insertions(+), 191 deletions(-) create mode 100644 services/web/scripts/delete_orphaned_data_helper.js diff --git a/services/web/scripts/delete_orphaned_chat_threads.js b/services/web/scripts/delete_orphaned_chat_threads.js index 1c30544d3d..1643d43046 100644 --- a/services/web/scripts/delete_orphaned_chat_threads.js +++ b/services/web/scripts/delete_orphaned_chat_threads.js @@ -13,11 +13,11 @@ process.env.BATCH_SIZE = BATCH_SIZE process.env.MONGO_SOCKET_TIMEOUT = parseInt(process.env.MONGO_SOCKET_TIMEOUT, 10) || 600000 -const { ObjectId, ReadPreference } = require('mongodb') -const { db } = require('../app/src/infrastructure/mongodb') +const { ObjectId } = require('mongodb') const { promiseMapWithLimit } = require('../app/src/util/promises') const { batchedUpdate } = require('./helpers/batchedUpdate') const ChatApiHandler = require('../app/src/Features/Chat/ChatApiHandler') +const { getHardDeletedProjectIds } = require('./delete_orphaned_data_helper') console.log({ DRY_RUN, @@ -36,11 +36,6 @@ async function processBatch(_, rooms) { if (rooms.length && rooms[0]._id) { RESULT.continueFrom = rooms[0]._id } - - // Logic taken from delete_orphaned_docs_online_check.js - // gets projectIds from rooms, - // then checks 'expired' status of project - const projectIds = Array.from( new Set(rooms.map(room => room.project_id.toString())) ).map(ObjectId) @@ -49,35 +44,11 @@ async function processBatch(_, rooms) { 
JSON.stringify(projectIds) ) - const doubleCheckProjectIdsOnPrimary = [] - async function checkProjectOnSecondary(projectId) { - if (await checkProjectExistsOnSecondary(projectId)) { - // Finding a project with secondary confidence is sufficient. - return - } - // At this point, the secondaries deem this project as having orphaned chat. - doubleCheckProjectIdsOnPrimary.push(projectId) - } - - const projectsWithOrphanedChat = [] - async function checkProjectOnPrimary(projectId) { - if (await checkProjectExistsOnPrimary(projectId)) { - // The project is actually live. - return - } - projectsWithOrphanedChat.push(projectId) - } - - await promiseMapWithLimit( - READ_CONCURRENCY_SECONDARY, + const projectsWithOrphanedChat = await getHardDeletedProjectIds({ projectIds, - checkProjectOnSecondary - ) - await promiseMapWithLimit( READ_CONCURRENCY_PRIMARY, - doubleCheckProjectIdsOnPrimary, - checkProjectOnPrimary - ) + READ_CONCURRENCY_SECONDARY, + }) console.log( `Destroying chat for projects (${projectsWithOrphanedChat.length})`, @@ -104,72 +75,6 @@ async function processBatch(_, rooms) { } } -async function getDeletedProject(projectId, readPreference) { - return await db.deletedProjects.findOne( - { 'deleterData.deletedProjectId': projectId }, - { - // There is no index on .project. Pull down something small. - projection: { 'project._id': 1 }, - readPreference, - } - ) -} - -async function getProject(projectId, readPreference) { - return await db.projects.findOne( - { _id: projectId }, - { - // Pulling down an empty object is fine for differentiating with null. - projection: { _id: 0 }, - readPreference, - } - ) -} - -async function checkProjectExistsWithReadPreference(projectId, readPreference) { - // NOTE: Possible race conditions! - // There are two processes which are racing with our queries: - // 1. project deletion - // 2. project restoring - // For 1. we check the projects collection before deletedProjects. 
- // If a project were to be delete in this very moment, we should see the - // soft-deleted entry which is created before deleting the projects entry. - // For 2. we check the projects collection after deletedProjects again. - // If a project were to be restored in this very moment, it is very likely - // to see the projects entry again. - // Unlikely edge case: Restore+Deletion in rapid succession. - // We could add locking to the ProjectDeleter for ruling ^ out. - if (await getProject(projectId, readPreference)) { - // The project is live. - return true - } - const deletedProject = await getDeletedProject(projectId, readPreference) - if (deletedProject && deletedProject.project) { - // The project is registered for hard-deletion. - return true - } - if (await getProject(projectId, readPreference)) { - // The project was just restored. - return true - } - // The project does not exist. - return false -} - -async function checkProjectExistsOnPrimary(projectId) { - return await checkProjectExistsWithReadPreference( - projectId, - ReadPreference.PRIMARY - ) -} - -async function checkProjectExistsOnSecondary(projectId) { - return await checkProjectExistsWithReadPreference( - projectId, - ReadPreference.SECONDARY - ) -} - async function main() { const projection = { _id: 1, diff --git a/services/web/scripts/delete_orphaned_data_helper.js b/services/web/scripts/delete_orphaned_data_helper.js new file mode 100644 index 0000000000..c16cf32f2b --- /dev/null +++ b/services/web/scripts/delete_orphaned_data_helper.js @@ -0,0 +1,110 @@ +const { ReadPreference } = require('mongodb') +const { db } = require('../app/src/infrastructure/mongodb') +const { promiseMapWithLimit } = require('../app/src/util/promises') + +async function getDeletedProject(projectId, readPreference) { + return await db.deletedProjects.findOne( + { 'deleterData.deletedProjectId': projectId }, + { + // There is no index on .project. Pull down something small. 
+ projection: { 'project._id': 1 }, + readPreference, + } + ) +} + +async function getProject(projectId, readPreference) { + return await db.projects.findOne( + { _id: projectId }, + { + // Pulling down an empty object is fine for differentiating with null. + projection: { _id: 0 }, + readPreference, + } + ) +} + +async function checkProjectExistsWithReadPreference(projectId, readPreference) { + // NOTE: Possible race conditions! + // There are two processes which are racing with our queries: + // 1. project deletion + // 2. project restoring + // For 1. we check the projects collection before deletedProjects. + // If a project were to be deleted in this very moment, we should see the + // soft-deleted entry which is created before deleting the projects entry. + // For 2. we check the projects collection after deletedProjects again. + // If a project were to be restored in this very moment, it is very likely + // to see the projects entry again. + // Unlikely edge case: Restore+Deletion in rapid succession. + // We could add locking to the ProjectDeleter for ruling ^ out. + if (await getProject(projectId, readPreference)) { + // The project is live. + return true + } + const deletedProject = await getDeletedProject(projectId, readPreference) + if (deletedProject && deletedProject.project) { + // The project is registered for hard-deletion. + return true + } + if (await getProject(projectId, readPreference)) { + // The project was just restored. + return true + } + // The project does not exist. 
+ return false +} + +async function checkProjectExistsOnPrimary(projectId) { + return await checkProjectExistsWithReadPreference( + projectId, + ReadPreference.PRIMARY + ) +} + +async function checkProjectExistsOnSecondary(projectId) { + return await checkProjectExistsWithReadPreference( + projectId, + ReadPreference.SECONDARY + ) +} + +async function getHardDeletedProjectIds({ + projectIds, + READ_CONCURRENCY_PRIMARY, + READ_CONCURRENCY_SECONDARY, +}) { + const doubleCheckProjectIdsOnPrimary = [] + async function checkProjectOnSecondary(projectId) { + if (await checkProjectExistsOnSecondary(projectId)) { + // Finding a project with secondary confidence is sufficient. + return + } + // At this point, the secondaries deem this project as having orphaned data. + doubleCheckProjectIdsOnPrimary.push(projectId) + } + + const hardDeletedProjectIds = [] + async function checkProjectOnPrimary(projectId) { + if (await checkProjectExistsOnPrimary(projectId)) { + // The project is actually live. + return + } + hardDeletedProjectIds.push(projectId) + } + + await promiseMapWithLimit( + READ_CONCURRENCY_SECONDARY, + projectIds, + checkProjectOnSecondary + ) + await promiseMapWithLimit( + READ_CONCURRENCY_PRIMARY, + doubleCheckProjectIdsOnPrimary, + checkProjectOnPrimary + ) + return hardDeletedProjectIds +} + +module.exports = { + getHardDeletedProjectIds, +} diff --git a/services/web/scripts/delete_orphaned_docs_online_check.js b/services/web/scripts/delete_orphaned_docs_online_check.js index ba7a2bba9e..d3b1ab2a7d 100644 --- a/services/web/scripts/delete_orphaned_docs_online_check.js +++ b/services/web/scripts/delete_orphaned_docs_online_check.js @@ -3,6 +3,7 @@ const { promisify } = require('util') const { ObjectId, ReadPreference } = require('mongodb') const { db, waitForDb } = require('../app/src/infrastructure/mongodb') const { promiseMapWithLimit } = require('../app/src/util/promises') +const { getHardDeletedProjectIds } = require('./delete_orphaned_data_helper') const 
sleep = promisify(setTimeout) const NOW_IN_S = Date.now() / 1000 @@ -90,28 +91,6 @@ async function main() { } } -async function getDeletedProject(projectId, readPreference) { - return await db.deletedProjects.findOne( - { 'deleterData.deletedProjectId': projectId }, - { - // There is no index on .project. Pull down something small. - projection: { 'project._id': 1 }, - readPreference, - } - ) -} - -async function getProject(projectId, readPreference) { - return await db.projects.findOne( - { _id: projectId }, - { - // Pulling down an empty object is fine for differentiating with null. - projection: { _id: 0 }, - readPreference, - } - ) -} - async function getProjectDocs(projectId) { return await db.docs .find( @@ -124,69 +103,15 @@ async function getProjectDocs(projectId) { .toArray() } -async function checkProjectExistsWithReadPreference(projectId, readPreference) { - // NOTE: Possible race conditions! - // There are two processes which are racing with our queries: - // 1. project deletion - // 2. project restoring - // For 1. we check the projects collection before deletedProjects. - // If a project were to be delete in this very moment, we should see the - // soft-deleted entry which is created before deleting the projects entry. - // For 2. we check the projects collection after deletedProjects again. - // If a project were to be restored in this very moment, it is very likely - // to see the projects entry again. - // Unlikely edge case: Restore+Deletion in rapid succession. - // We could add locking to the ProjectDeleter for ruling ^ out. - if (await getProject(projectId, readPreference)) { - // The project is live. - return true - } - const deletedProject = await getDeletedProject(projectId, readPreference) - if (deletedProject && deletedProject.project) { - // The project is registered for hard-deletion. - return true - } - if (await getProject(projectId, readPreference)) { - // The project was just restored. 
- return true - } - // The project does not exist. - return false -} - -async function checkProjectExistsOnPrimary(projectId) { - return await checkProjectExistsWithReadPreference( - projectId, - ReadPreference.PRIMARY - ) -} - -async function checkProjectExistsOnSecondary(projectId) { - return await checkProjectExistsWithReadPreference( - projectId, - ReadPreference.SECONDARY - ) -} - async function processBatch(projectIds) { - const doubleCheckProjectIdsOnPrimary = [] - let nDeletedDocs = 0 - async function checkProjectOnSecondary(projectId) { - if (await checkProjectExistsOnSecondary(projectId)) { - // Finding a project with secondary confidence is sufficient. - return - } - // At this point, the secondaries deem this project as having orphaned docs. - doubleCheckProjectIdsOnPrimary.push(projectId) - } + const projectsWithOrphanedDocs = await getHardDeletedProjectIds({ + projectIds, + READ_CONCURRENCY_PRIMARY, + READ_CONCURRENCY_SECONDARY, + }) - const projectsWithOrphanedDocs = [] - async function checkProjectOnPrimary(projectId) { - if (await checkProjectExistsOnPrimary(projectId)) { - // The project is actually live. - return - } - projectsWithOrphanedDocs.push(projectId) + let nDeletedDocs = 0 + async function countOrphanedDocs(projectId) { const docs = await getProjectDocs(projectId) nDeletedDocs += docs.length console.log( @@ -196,16 +121,10 @@ async function processBatch(projectIds) { JSON.stringify(docs.map(doc => doc._id)) ) } - - await promiseMapWithLimit( - READ_CONCURRENCY_SECONDARY, - projectIds, - checkProjectOnSecondary - ) await promiseMapWithLimit( READ_CONCURRENCY_PRIMARY, - doubleCheckProjectIdsOnPrimary, - checkProjectOnPrimary + projectsWithOrphanedDocs, + countOrphanedDocs ) if (!DRY_RUN) { await promiseMapWithLimit(