Merge pull request #7579 from overleaf/jpa-refactor-orphaned-data-helper

[web] de-duplicate logic for getting a list of hard-deleted project ids

GitOrigin-RevId: daf2ff427e24f9ef9253e4bc9ff52f53196fd854
This commit is contained in:
Jakob Ackermann 2022-04-25 10:18:05 +01:00 committed by Copybot
parent f1f9771150
commit 0120268c57
3 changed files with 125 additions and 191 deletions

View file

@ -13,11 +13,11 @@ process.env.BATCH_SIZE = BATCH_SIZE
process.env.MONGO_SOCKET_TIMEOUT =
parseInt(process.env.MONGO_SOCKET_TIMEOUT, 10) || 600000
const { ObjectId, ReadPreference } = require('mongodb')
const { db } = require('../app/src/infrastructure/mongodb')
const { ObjectId } = require('mongodb')
const { promiseMapWithLimit } = require('../app/src/util/promises')
const { batchedUpdate } = require('./helpers/batchedUpdate')
const ChatApiHandler = require('../app/src/Features/Chat/ChatApiHandler')
const { getHardDeletedProjectIds } = require('./delete_orphaned_data_helper')
console.log({
DRY_RUN,
@ -36,11 +36,6 @@ async function processBatch(_, rooms) {
if (rooms.length && rooms[0]._id) {
RESULT.continueFrom = rooms[0]._id
}
// Logic taken from delete_orphaned_docs_online_check.js
// gets projectIds from rooms,
// then checks 'expired' status of project
const projectIds = Array.from(
new Set(rooms.map(room => room.project_id.toString()))
).map(ObjectId)
@ -49,35 +44,11 @@ async function processBatch(_, rooms) {
JSON.stringify(projectIds)
)
const doubleCheckProjectIdsOnPrimary = []
async function checkProjectOnSecondary(projectId) {
if (await checkProjectExistsOnSecondary(projectId)) {
// Finding a project with secondary confidence is sufficient.
return
}
// At this point, the secondaries deem this project as having orphaned chat.
doubleCheckProjectIdsOnPrimary.push(projectId)
}
const projectsWithOrphanedChat = []
async function checkProjectOnPrimary(projectId) {
if (await checkProjectExistsOnPrimary(projectId)) {
// The project is actually live.
return
}
projectsWithOrphanedChat.push(projectId)
}
await promiseMapWithLimit(
READ_CONCURRENCY_SECONDARY,
const projectsWithOrphanedChat = await getHardDeletedProjectIds({
projectIds,
checkProjectOnSecondary
)
await promiseMapWithLimit(
READ_CONCURRENCY_PRIMARY,
doubleCheckProjectIdsOnPrimary,
checkProjectOnPrimary
)
READ_CONCURRENCY_SECONDARY,
})
console.log(
`Destroying chat for projects (${projectsWithOrphanedChat.length})`,
@ -104,72 +75,6 @@ async function processBatch(_, rooms) {
}
}
/**
 * Fetch the `deletedProjects` entry that was recorded for a project.
 *
 * @param projectId - value matched against `deleterData.deletedProjectId`
 * @param readPreference - read preference forwarded to the driver as a query option
 * @returns whatever `findOne` resolves to: the matching entry fetched with a
 *   `project._id` projection, or a falsy value when nothing matched
 */
async function getDeletedProject(projectId, readPreference) {
  const filter = { 'deleterData.deletedProjectId': projectId }
  const queryOptions = {
    // `.project` has no index, so keep the fetched payload small.
    projection: { 'project._id': 1 },
    readPreference,
  }
  return await db.deletedProjects.findOne(filter, queryOptions)
}
/**
 * Probe the `projects` collection for a project by `_id`.
 *
 * @param projectId - value matched against `_id`
 * @param readPreference - read preference forwarded to the driver as a query option
 * @returns whatever `findOne` resolves to; only truthiness is meaningful
 *   because the projection strips `_id`
 */
async function getProject(projectId, readPreference) {
  const filter = { _id: projectId }
  const queryOptions = {
    // An empty object is all we need to tell a hit apart from null.
    projection: { _id: 0 },
    readPreference,
  }
  return await db.projects.findOne(filter, queryOptions)
}
/**
 * Decide whether a project still exists, either live in `projects` or as a
 * soft-deleted entry in `deletedProjects` that still carries its `project`.
 *
 * NOTE: Possible race conditions!
 * Two processes may race with the queries below:
 *  1. project deletion: `projects` is checked before `deletedProjects`.
 *     If the project is being deleted right now, we should see the
 *     soft-deleted entry, which is created before the `projects` entry is
 *     removed.
 *  2. project restoring: `projects` is checked once more after
 *     `deletedProjects`. If the project is being restored right now, it is
 *     very likely that we see the `projects` entry again.
 * Unlikely edge case: restore + deletion in rapid succession. Locking in the
 * ProjectDeleter could rule that out.
 *
 * @param projectId - id of the project to look up
 * @param readPreference - read preference used for every query
 * @returns {Promise<boolean>} true when any of the probes finds the project
 */
async function checkProjectExistsWithReadPreference(projectId, readPreference) {
  const isLive = async () =>
    Boolean(await getProject(projectId, readPreference))

  const isAwaitingHardDeletion = async () => {
    const entry = await getDeletedProject(projectId, readPreference)
    return Boolean(entry && entry.project)
  }

  // The order is significant (see the note above); `||` short-circuits, so
  // later probes only run when the earlier ones found nothing.
  return (
    (await isLive()) || // the project is live
    (await isAwaitingHardDeletion()) || // registered for hard-deletion
    (await isLive()) // the project was just restored
  )
}
/**
 * Run the existence check with `ReadPreference.PRIMARY`.
 *
 * @param projectId - id of the project to look up
 * @returns the result of `checkProjectExistsWithReadPreference`
 */
async function checkProjectExistsOnPrimary(projectId) {
  const { PRIMARY } = ReadPreference
  const exists = await checkProjectExistsWithReadPreference(projectId, PRIMARY)
  return exists
}
/**
 * Run the existence check with `ReadPreference.SECONDARY`.
 *
 * @param projectId - id of the project to look up
 * @returns the result of `checkProjectExistsWithReadPreference`
 */
async function checkProjectExistsOnSecondary(projectId) {
  const { SECONDARY } = ReadPreference
  const exists = await checkProjectExistsWithReadPreference(
    projectId,
    SECONDARY
  )
  return exists
}
async function main() {
const projection = {
_id: 1,

View file

@ -0,0 +1,110 @@
const { ReadPreference } = require('mongodb')
const { db } = require('../app/src/infrastructure/mongodb')
const { promiseMapWithLimit } = require('../app/src/util/promises')
/**
 * Fetch the `deletedProjects` entry that was recorded for a project.
 *
 * @param projectId - value matched against `deleterData.deletedProjectId`
 * @param readPreference - read preference forwarded to the driver as a query option
 * @returns whatever `findOne` resolves to: the matching entry fetched with a
 *   `project._id` projection, or a falsy value when nothing matched
 */
async function getDeletedProject(projectId, readPreference) {
  const filter = { 'deleterData.deletedProjectId': projectId }
  const queryOptions = {
    // `.project` has no index, so keep the fetched payload small.
    projection: { 'project._id': 1 },
    readPreference,
  }
  return await db.deletedProjects.findOne(filter, queryOptions)
}
/**
 * Probe the `projects` collection for a project by `_id`.
 *
 * @param projectId - value matched against `_id`
 * @param readPreference - read preference forwarded to the driver as a query option
 * @returns whatever `findOne` resolves to; only truthiness is meaningful
 *   because the projection strips `_id`
 */
async function getProject(projectId, readPreference) {
  const filter = { _id: projectId }
  const queryOptions = {
    // An empty object is all we need to tell a hit apart from null.
    projection: { _id: 0 },
    readPreference,
  }
  return await db.projects.findOne(filter, queryOptions)
}
/**
 * Decide whether a project still exists, either live in `projects` or as a
 * soft-deleted entry in `deletedProjects` that still carries its `project`.
 *
 * NOTE: Possible race conditions!
 * Two processes may race with the queries below:
 *  1. project deletion: `projects` is checked before `deletedProjects`.
 *     If the project is being deleted right now, we should see the
 *     soft-deleted entry, which is created before the `projects` entry is
 *     removed.
 *  2. project restoring: `projects` is checked once more after
 *     `deletedProjects`. If the project is being restored right now, it is
 *     very likely that we see the `projects` entry again.
 * Unlikely edge case: restore + deletion in rapid succession. Locking in the
 * ProjectDeleter could rule that out.
 *
 * @param projectId - id of the project to look up
 * @param readPreference - read preference used for every query
 * @returns {Promise<boolean>} true when any of the probes finds the project
 */
async function checkProjectExistsWithReadPreference(projectId, readPreference) {
  const isLive = async () =>
    Boolean(await getProject(projectId, readPreference))

  const isAwaitingHardDeletion = async () => {
    const entry = await getDeletedProject(projectId, readPreference)
    return Boolean(entry && entry.project)
  }

  // The order is significant (see the note above); `||` short-circuits, so
  // later probes only run when the earlier ones found nothing.
  return (
    (await isLive()) || // the project is live
    (await isAwaitingHardDeletion()) || // registered for hard-deletion
    (await isLive()) // the project was just restored
  )
}
/**
 * Run the existence check with `ReadPreference.PRIMARY`.
 *
 * @param projectId - id of the project to look up
 * @returns the result of `checkProjectExistsWithReadPreference`
 */
async function checkProjectExistsOnPrimary(projectId) {
  const { PRIMARY } = ReadPreference
  const exists = await checkProjectExistsWithReadPreference(projectId, PRIMARY)
  return exists
}
/**
 * Run the existence check with `ReadPreference.SECONDARY`.
 *
 * @param projectId - id of the project to look up
 * @returns the result of `checkProjectExistsWithReadPreference`
 */
async function checkProjectExistsOnSecondary(projectId) {
  const { SECONDARY } = ReadPreference
  const exists = await checkProjectExistsWithReadPreference(
    projectId,
    SECONDARY
  )
  return exists
}
/**
 * Filter a list of project ids down to those that no longer exist anywhere,
 * i.e. neither as a live project nor as a soft-deleted entry.
 *
 * The check runs in two passes: every id is checked with the secondary read
 * preference first, and only the ids that look missing there are checked
 * again with the primary read preference.
 *
 * @param {Object} options
 * @param {Array} options.projectIds - candidate project ids
 * @param options.READ_CONCURRENCY_PRIMARY - limit handed to
 *   `promiseMapWithLimit` for the primary pass
 * @param options.READ_CONCURRENCY_SECONDARY - limit handed to
 *   `promiseMapWithLimit` for the secondary pass
 * @returns {Promise<Array>} ids for which both passes found no project, in
 *   the order their primary checks completed
 */
async function getHardDeletedProjectIds({
  projectIds,
  READ_CONCURRENCY_PRIMARY,
  READ_CONCURRENCY_SECONDARY,
}) {
  // Collect the ids for which `checkExists` resolves to a falsy value.
  async function collectMissing(concurrency, candidateIds, checkExists) {
    const missingIds = []
    await promiseMapWithLimit(concurrency, candidateIds, async candidateId => {
      if (!(await checkExists(candidateId))) {
        missingIds.push(candidateId)
      }
    })
    return missingIds
  }

  // Finding a project with secondary confidence is sufficient; only ids the
  // secondaries deem missing need a second look.
  const missingOnSecondary = await collectMissing(
    READ_CONCURRENCY_SECONDARY,
    projectIds,
    checkProjectExistsOnSecondary
  )

  // Ids that the primary still knows about are actually live; drop them.
  return await collectMissing(
    READ_CONCURRENCY_PRIMARY,
    missingOnSecondary,
    checkProjectExistsOnPrimary
  )
}
module.exports = {
getHardDeletedProjectIds,
}

View file

@ -3,6 +3,7 @@ const { promisify } = require('util')
const { ObjectId, ReadPreference } = require('mongodb')
const { db, waitForDb } = require('../app/src/infrastructure/mongodb')
const { promiseMapWithLimit } = require('../app/src/util/promises')
const { getHardDeletedProjectIds } = require('./delete_orphaned_data_helper')
const sleep = promisify(setTimeout)
const NOW_IN_S = Date.now() / 1000
@ -90,28 +91,6 @@ async function main() {
}
}
/**
 * Fetch the `deletedProjects` entry that was recorded for a project.
 *
 * @param projectId - value matched against `deleterData.deletedProjectId`
 * @param readPreference - read preference forwarded to the driver as a query option
 * @returns whatever `findOne` resolves to: the matching entry fetched with a
 *   `project._id` projection, or a falsy value when nothing matched
 */
async function getDeletedProject(projectId, readPreference) {
  const filter = { 'deleterData.deletedProjectId': projectId }
  const queryOptions = {
    // `.project` has no index, so keep the fetched payload small.
    projection: { 'project._id': 1 },
    readPreference,
  }
  return await db.deletedProjects.findOne(filter, queryOptions)
}
/**
 * Probe the `projects` collection for a project by `_id`.
 *
 * @param projectId - value matched against `_id`
 * @param readPreference - read preference forwarded to the driver as a query option
 * @returns whatever `findOne` resolves to; only truthiness is meaningful
 *   because the projection strips `_id`
 */
async function getProject(projectId, readPreference) {
  const filter = { _id: projectId }
  const queryOptions = {
    // An empty object is all we need to tell a hit apart from null.
    projection: { _id: 0 },
    readPreference,
  }
  return await db.projects.findOne(filter, queryOptions)
}
async function getProjectDocs(projectId) {
return await db.docs
.find(
@ -124,69 +103,15 @@ async function getProjectDocs(projectId) {
.toArray()
}
/**
 * Decide whether a project still exists, either live in `projects` or as a
 * soft-deleted entry in `deletedProjects` that still carries its `project`.
 *
 * NOTE: Possible race conditions!
 * Two processes may race with the queries below:
 *  1. project deletion: `projects` is checked before `deletedProjects`.
 *     If the project is being deleted right now, we should see the
 *     soft-deleted entry, which is created before the `projects` entry is
 *     removed.
 *  2. project restoring: `projects` is checked once more after
 *     `deletedProjects`. If the project is being restored right now, it is
 *     very likely that we see the `projects` entry again.
 * Unlikely edge case: restore + deletion in rapid succession. Locking in the
 * ProjectDeleter could rule that out.
 *
 * @param projectId - id of the project to look up
 * @param readPreference - read preference used for every query
 * @returns {Promise<boolean>} true when any of the probes finds the project
 */
async function checkProjectExistsWithReadPreference(projectId, readPreference) {
  const isLive = async () =>
    Boolean(await getProject(projectId, readPreference))

  const isAwaitingHardDeletion = async () => {
    const entry = await getDeletedProject(projectId, readPreference)
    return Boolean(entry && entry.project)
  }

  // The order is significant (see the note above); `||` short-circuits, so
  // later probes only run when the earlier ones found nothing.
  return (
    (await isLive()) || // the project is live
    (await isAwaitingHardDeletion()) || // registered for hard-deletion
    (await isLive()) // the project was just restored
  )
}
/**
 * Run the existence check with `ReadPreference.PRIMARY`.
 *
 * @param projectId - id of the project to look up
 * @returns the result of `checkProjectExistsWithReadPreference`
 */
async function checkProjectExistsOnPrimary(projectId) {
  const { PRIMARY } = ReadPreference
  const exists = await checkProjectExistsWithReadPreference(projectId, PRIMARY)
  return exists
}
/**
 * Run the existence check with `ReadPreference.SECONDARY`.
 *
 * @param projectId - id of the project to look up
 * @returns the result of `checkProjectExistsWithReadPreference`
 */
async function checkProjectExistsOnSecondary(projectId) {
  const { SECONDARY } = ReadPreference
  const exists = await checkProjectExistsWithReadPreference(
    projectId,
    SECONDARY
  )
  return exists
}
async function processBatch(projectIds) {
const doubleCheckProjectIdsOnPrimary = []
let nDeletedDocs = 0
async function checkProjectOnSecondary(projectId) {
if (await checkProjectExistsOnSecondary(projectId)) {
// Finding a project with secondary confidence is sufficient.
return
}
// At this point, the secondaries deem this project as having orphaned docs.
doubleCheckProjectIdsOnPrimary.push(projectId)
}
const projectsWithOrphanedDocs = await getHardDeletedProjectIds({
projectIds,
READ_CONCURRENCY_PRIMARY,
READ_CONCURRENCY_SECONDARY,
})
const projectsWithOrphanedDocs = []
async function checkProjectOnPrimary(projectId) {
if (await checkProjectExistsOnPrimary(projectId)) {
// The project is actually live.
return
}
projectsWithOrphanedDocs.push(projectId)
let nDeletedDocs = 0
async function countOrphanedDocs(projectId) {
const docs = await getProjectDocs(projectId)
nDeletedDocs += docs.length
console.log(
@ -196,16 +121,10 @@ async function processBatch(projectIds) {
JSON.stringify(docs.map(doc => doc._id))
)
}
await promiseMapWithLimit(
READ_CONCURRENCY_SECONDARY,
projectIds,
checkProjectOnSecondary
)
await promiseMapWithLimit(
READ_CONCURRENCY_PRIMARY,
doubleCheckProjectIdsOnPrimary,
checkProjectOnPrimary
projectsWithOrphanedDocs,
countOrphanedDocs
)
if (!DRY_RUN) {
await promiseMapWithLimit(