Merge pull request #7579 from overleaf/jpa-refactor-orphaned-data-helper

[web] de-duplicate logic for getting a list of hard-deleted project ids

GitOrigin-RevId: daf2ff427e24f9ef9253e4bc9ff52f53196fd854
This commit is contained in:
Jakob Ackermann 2022-04-25 10:18:05 +01:00 committed by Copybot
parent f1f9771150
commit 0120268c57
3 changed files with 125 additions and 191 deletions

View file

@ -13,11 +13,11 @@ process.env.BATCH_SIZE = BATCH_SIZE
process.env.MONGO_SOCKET_TIMEOUT = process.env.MONGO_SOCKET_TIMEOUT =
parseInt(process.env.MONGO_SOCKET_TIMEOUT, 10) || 600000 parseInt(process.env.MONGO_SOCKET_TIMEOUT, 10) || 600000
const { ObjectId, ReadPreference } = require('mongodb') const { ObjectId } = require('mongodb')
const { db } = require('../app/src/infrastructure/mongodb')
const { promiseMapWithLimit } = require('../app/src/util/promises') const { promiseMapWithLimit } = require('../app/src/util/promises')
const { batchedUpdate } = require('./helpers/batchedUpdate') const { batchedUpdate } = require('./helpers/batchedUpdate')
const ChatApiHandler = require('../app/src/Features/Chat/ChatApiHandler') const ChatApiHandler = require('../app/src/Features/Chat/ChatApiHandler')
const { getHardDeletedProjectIds } = require('./delete_orphaned_data_helper')
console.log({ console.log({
DRY_RUN, DRY_RUN,
@ -36,11 +36,6 @@ async function processBatch(_, rooms) {
if (rooms.length && rooms[0]._id) { if (rooms.length && rooms[0]._id) {
RESULT.continueFrom = rooms[0]._id RESULT.continueFrom = rooms[0]._id
} }
// Logic taken from delete_orphaned_docs_online_check.js
// gets projectIds from rooms,
// then checks 'expired' status of project
const projectIds = Array.from( const projectIds = Array.from(
new Set(rooms.map(room => room.project_id.toString())) new Set(rooms.map(room => room.project_id.toString()))
).map(ObjectId) ).map(ObjectId)
@ -49,35 +44,11 @@ async function processBatch(_, rooms) {
JSON.stringify(projectIds) JSON.stringify(projectIds)
) )
const doubleCheckProjectIdsOnPrimary = [] const projectsWithOrphanedChat = await getHardDeletedProjectIds({
async function checkProjectOnSecondary(projectId) {
if (await checkProjectExistsOnSecondary(projectId)) {
// Finding a project with secondary confidence is sufficient.
return
}
// At this point, the secondaries deem this project as having orphaned chat.
doubleCheckProjectIdsOnPrimary.push(projectId)
}
const projectsWithOrphanedChat = []
async function checkProjectOnPrimary(projectId) {
if (await checkProjectExistsOnPrimary(projectId)) {
// The project is actually live.
return
}
projectsWithOrphanedChat.push(projectId)
}
await promiseMapWithLimit(
READ_CONCURRENCY_SECONDARY,
projectIds, projectIds,
checkProjectOnSecondary
)
await promiseMapWithLimit(
READ_CONCURRENCY_PRIMARY, READ_CONCURRENCY_PRIMARY,
doubleCheckProjectIdsOnPrimary, READ_CONCURRENCY_SECONDARY,
checkProjectOnPrimary })
)
console.log( console.log(
`Destroying chat for projects (${projectsWithOrphanedChat.length})`, `Destroying chat for projects (${projectsWithOrphanedChat.length})`,
@ -104,72 +75,6 @@ async function processBatch(_, rooms) {
} }
} }
async function getDeletedProject(projectId, readPreference) {
return await db.deletedProjects.findOne(
{ 'deleterData.deletedProjectId': projectId },
{
// There is no index on .project. Pull down something small.
projection: { 'project._id': 1 },
readPreference,
}
)
}
async function getProject(projectId, readPreference) {
return await db.projects.findOne(
{ _id: projectId },
{
// Pulling down an empty object is fine for differentiating with null.
projection: { _id: 0 },
readPreference,
}
)
}
async function checkProjectExistsWithReadPreference(projectId, readPreference) {
// NOTE: Possible race conditions!
// There are two processes which are racing with our queries:
// 1. project deletion
// 2. project restoring
// For 1. we check the projects collection before deletedProjects.
// If a project were to be delete in this very moment, we should see the
// soft-deleted entry which is created before deleting the projects entry.
// For 2. we check the projects collection after deletedProjects again.
// If a project were to be restored in this very moment, it is very likely
// to see the projects entry again.
// Unlikely edge case: Restore+Deletion in rapid succession.
// We could add locking to the ProjectDeleter for ruling ^ out.
if (await getProject(projectId, readPreference)) {
// The project is live.
return true
}
const deletedProject = await getDeletedProject(projectId, readPreference)
if (deletedProject && deletedProject.project) {
// The project is registered for hard-deletion.
return true
}
if (await getProject(projectId, readPreference)) {
// The project was just restored.
return true
}
// The project does not exist.
return false
}
async function checkProjectExistsOnPrimary(projectId) {
return await checkProjectExistsWithReadPreference(
projectId,
ReadPreference.PRIMARY
)
}
async function checkProjectExistsOnSecondary(projectId) {
return await checkProjectExistsWithReadPreference(
projectId,
ReadPreference.SECONDARY
)
}
async function main() { async function main() {
const projection = { const projection = {
_id: 1, _id: 1,

View file

@ -0,0 +1,110 @@
const { ReadPreference } = require('mongodb')
const { db } = require('../app/src/infrastructure/mongodb')
const { promiseMapWithLimit } = require('../app/src/util/promises')
/**
 * Fetch the `deletedProjects` entry that was recorded for a project.
 *
 * @param projectId - id matched against `deleterData.deletedProjectId`
 * @param readPreference - read preference forwarded to the query
 * @returns the matching entry, limited to `project._id`, or whatever
 *   `findOne` yields when nothing matches
 */
async function getDeletedProject(projectId, readPreference) {
  const filter = { 'deleterData.deletedProjectId': projectId }
  const options = {
    // `.project` has no index, so keep the fetched payload as small as we can.
    projection: { 'project._id': 1 },
    readPreference,
  }
  const entry = await db.deletedProjects.findOne(filter, options)
  return entry
}
/**
 * Look up a live project by id, fetching as little of it as possible.
 *
 * @param projectId - `_id` of the project to look for
 * @param readPreference - read preference forwarded to the query
 * @returns a document holding only `_id` when the project exists, otherwise
 *   whatever `findOne` yields for "no match"
 */
async function getProject(projectId, readPreference) {
  return await db.projects.findOne(
    { _id: projectId },
    {
      // Callers only need to tell "found" apart from null, so fetch just the
      // _id. An exclusion-only projection such as `{ _id: 0 }` does NOT yield
      // an empty object: it returns every other field of the project.
      projection: { _id: 1 },
      readPreference,
    }
  )
}
/**
 * Tell whether a project still exists, either live or soft-deleted.
 *
 * NOTE: Possible race conditions!
 * Two processes can race with the lookups below:
 *  1. project deletion
 *  2. project restoring
 * For 1. the projects collection is queried before deletedProjects: if the
 *  project is being deleted right now, we should see the soft-deleted entry,
 *  which is created before the projects entry is removed.
 * For 2. the projects collection is queried once more after deletedProjects:
 *  if the project is being restored right now, it is very likely that the
 *  projects entry is visible again.
 * Unlikely edge case: restore + deletion in rapid succession. Adding locking
 *  to the ProjectDeleter could rule that out.
 *
 * @param projectId - id of the project to look for
 * @param readPreference - read preference used for every lookup
 * @returns {Promise<boolean>} true when the project is live or has a
 *   soft-deleted entry that still carries `.project`, false otherwise
 */
async function checkProjectExistsWithReadPreference(projectId, readPreference) {
  const isLive = async () =>
    Boolean(await getProject(projectId, readPreference))

  const isSoftDeleted = async () => {
    const entry = await getDeletedProject(projectId, readPreference)
    return Boolean(entry && entry.project)
  }

  // `||` evaluates lazily, so the lookups run strictly one after another and
  // stop at the first hit: live -> soft-deleted -> live again (just restored).
  return (await isLive()) || (await isSoftDeleted()) || (await isLive())
}
/**
 * Run the project existence check with the PRIMARY read preference.
 *
 * @param projectId - id of the project to look for
 * @returns the result of `checkProjectExistsWithReadPreference`
 */
async function checkProjectExistsOnPrimary(projectId) {
  const { PRIMARY } = ReadPreference
  const exists = await checkProjectExistsWithReadPreference(projectId, PRIMARY)
  return exists
}
/**
 * Run the project existence check with the SECONDARY read preference.
 *
 * @param projectId - id of the project to look for
 * @returns the result of `checkProjectExistsWithReadPreference`
 */
async function checkProjectExistsOnSecondary(projectId) {
  const { SECONDARY } = ReadPreference
  const exists = await checkProjectExistsWithReadPreference(
    projectId,
    SECONDARY
  )
  return exists
}
/**
 * Filter a list of project ids down to those that no longer exist anywhere.
 *
 * Every id is first checked with the secondary read preference. Only the ids
 * that look missing there are checked again with the primary read preference,
 * and only ids that are missing in both passes are returned.
 *
 * @param {Object} params
 * @param {Array} params.projectIds - candidate project ids
 * @param {number} params.READ_CONCURRENCY_PRIMARY - limit passed to
 *   `promiseMapWithLimit` for the primary pass
 * @param {number} params.READ_CONCURRENCY_SECONDARY - limit passed to
 *   `promiseMapWithLimit` for the secondary pass
 * @returns {Promise<Array>} ids for which neither pass found a project
 */
async function getHardDeletedProjectIds({
  projectIds,
  READ_CONCURRENCY_PRIMARY,
  READ_CONCURRENCY_SECONDARY,
}) {
  // Pass 1: screen all candidates using the secondary read preference.
  // Finding a project there is considered sufficient proof that it exists.
  const missingOnSecondary = []
  await promiseMapWithLimit(
    READ_CONCURRENCY_SECONDARY,
    projectIds,
    async candidateId => {
      const foundOnSecondary = await checkProjectExistsOnSecondary(candidateId)
      if (!foundOnSecondary) {
        // The secondaries consider this project gone; confirm it in pass 2.
        missingOnSecondary.push(candidateId)
      }
    }
  )

  // Pass 2: double check the suspects using the primary read preference.
  const confirmedMissing = []
  await promiseMapWithLimit(
    READ_CONCURRENCY_PRIMARY,
    missingOnSecondary,
    async candidateId => {
      const foundOnPrimary = await checkProjectExistsOnPrimary(candidateId)
      if (!foundOnPrimary) {
        confirmedMissing.push(candidateId)
      }
      // Otherwise the project is actually live and must be left alone.
    }
  )

  return confirmedMissing
}
// Only the batch-level check is exported; the per-project lookups defined in
// this file are not part of the module's interface.
module.exports = {
getHardDeletedProjectIds,
}

View file

@ -3,6 +3,7 @@ const { promisify } = require('util')
const { ObjectId, ReadPreference } = require('mongodb') const { ObjectId, ReadPreference } = require('mongodb')
const { db, waitForDb } = require('../app/src/infrastructure/mongodb') const { db, waitForDb } = require('../app/src/infrastructure/mongodb')
const { promiseMapWithLimit } = require('../app/src/util/promises') const { promiseMapWithLimit } = require('../app/src/util/promises')
const { getHardDeletedProjectIds } = require('./delete_orphaned_data_helper')
const sleep = promisify(setTimeout) const sleep = promisify(setTimeout)
const NOW_IN_S = Date.now() / 1000 const NOW_IN_S = Date.now() / 1000
@ -90,28 +91,6 @@ async function main() {
} }
} }
async function getDeletedProject(projectId, readPreference) {
return await db.deletedProjects.findOne(
{ 'deleterData.deletedProjectId': projectId },
{
// There is no index on .project. Pull down something small.
projection: { 'project._id': 1 },
readPreference,
}
)
}
async function getProject(projectId, readPreference) {
return await db.projects.findOne(
{ _id: projectId },
{
// Pulling down an empty object is fine for differentiating with null.
projection: { _id: 0 },
readPreference,
}
)
}
async function getProjectDocs(projectId) { async function getProjectDocs(projectId) {
return await db.docs return await db.docs
.find( .find(
@ -124,69 +103,15 @@ async function getProjectDocs(projectId) {
.toArray() .toArray()
} }
async function checkProjectExistsWithReadPreference(projectId, readPreference) {
// NOTE: Possible race conditions!
// There are two processes which are racing with our queries:
// 1. project deletion
// 2. project restoring
// For 1. we check the projects collection before deletedProjects.
// If a project were to be delete in this very moment, we should see the
// soft-deleted entry which is created before deleting the projects entry.
// For 2. we check the projects collection after deletedProjects again.
// If a project were to be restored in this very moment, it is very likely
// to see the projects entry again.
// Unlikely edge case: Restore+Deletion in rapid succession.
// We could add locking to the ProjectDeleter for ruling ^ out.
if (await getProject(projectId, readPreference)) {
// The project is live.
return true
}
const deletedProject = await getDeletedProject(projectId, readPreference)
if (deletedProject && deletedProject.project) {
// The project is registered for hard-deletion.
return true
}
if (await getProject(projectId, readPreference)) {
// The project was just restored.
return true
}
// The project does not exist.
return false
}
async function checkProjectExistsOnPrimary(projectId) {
return await checkProjectExistsWithReadPreference(
projectId,
ReadPreference.PRIMARY
)
}
async function checkProjectExistsOnSecondary(projectId) {
return await checkProjectExistsWithReadPreference(
projectId,
ReadPreference.SECONDARY
)
}
async function processBatch(projectIds) { async function processBatch(projectIds) {
const doubleCheckProjectIdsOnPrimary = [] const projectsWithOrphanedDocs = await getHardDeletedProjectIds({
let nDeletedDocs = 0 projectIds,
async function checkProjectOnSecondary(projectId) { READ_CONCURRENCY_PRIMARY,
if (await checkProjectExistsOnSecondary(projectId)) { READ_CONCURRENCY_SECONDARY,
// Finding a project with secondary confidence is sufficient. })
return
}
// At this point, the secondaries deem this project as having orphaned docs.
doubleCheckProjectIdsOnPrimary.push(projectId)
}
const projectsWithOrphanedDocs = [] let nDeletedDocs = 0
async function checkProjectOnPrimary(projectId) { async function countOrphanedDocs(projectId) {
if (await checkProjectExistsOnPrimary(projectId)) {
// The project is actually live.
return
}
projectsWithOrphanedDocs.push(projectId)
const docs = await getProjectDocs(projectId) const docs = await getProjectDocs(projectId)
nDeletedDocs += docs.length nDeletedDocs += docs.length
console.log( console.log(
@ -196,16 +121,10 @@ async function processBatch(projectIds) {
JSON.stringify(docs.map(doc => doc._id)) JSON.stringify(docs.map(doc => doc._id))
) )
} }
await promiseMapWithLimit(
READ_CONCURRENCY_SECONDARY,
projectIds,
checkProjectOnSecondary
)
await promiseMapWithLimit( await promiseMapWithLimit(
READ_CONCURRENCY_PRIMARY, READ_CONCURRENCY_PRIMARY,
doubleCheckProjectIdsOnPrimary, projectsWithOrphanedDocs,
checkProjectOnPrimary countOrphanedDocs
) )
if (!DRY_RUN) { if (!DRY_RUN) {
await promiseMapWithLimit( await promiseMapWithLimit(