Merge pull request #3925 from overleaf/jpa-cache-deleted-project

[scripts] back_fill_dummy_doc_meta: add a cache for deletedProjects

GitOrigin-RevId: 3d24c9fc4985f33b2e18f8f63f0b690d65476d1b
This commit is contained in:
Jakob Ackermann 2021-04-20 14:21:32 +02:00 committed by Copybot
parent 35e9d4ea0b
commit 58fbbf6269
4 changed files with 183 additions and 65 deletions

View file

@ -2247,6 +2247,16 @@
"gtoken": "^4.1.0",
"jws": "^4.0.0",
"lru-cache": "^5.0.0"
},
"dependencies": {
"lru-cache": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
"requires": {
"yallist": "^3.0.2"
}
}
}
},
"google-p12-pem": {
@ -2382,6 +2392,11 @@
"version": "7.0.3",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-7.0.3.tgz",
"integrity": "sha512-DPSke0pXhTZgoF/d+WSt2QaKMCFSfx7QegxEWT+JOuHF5aWrKEn0G+ztjuJg/gG8/ItK+rbPCD/yNv8yyih6Cg=="
},
"yallist": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz",
"integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g=="
}
}
},
@ -9645,6 +9660,15 @@
"integrity": "sha512-a30VEBm4PEdx1dRB7MFK7BejejvCvBronbLjht+sHuGYj8PHs7M/5Z+rt5lw551vZ7yfTCj4Vuyy3mSJytDWRQ==",
"dev": true
},
"lru-cache": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
"dev": true,
"requires": {
"yallist": "^3.0.2"
}
},
"rimraf": {
"version": "2.7.1",
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.7.1.tgz",
@ -9657,7 +9681,8 @@
"yallist": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz",
"integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g=="
"integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==",
"dev": true
}
}
},
@ -15445,7 +15470,7 @@
"functional-red-black-tree": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/functional-red-black-tree/-/functional-red-black-tree-1.0.1.tgz",
"integrity": "sha512-dsKNQNdj6xA3T+QlADDA7mOSlX0qiMINjn0cgr+eGHGsbSHzTabcIogz2+p/iqP1Xs6EP/sS2SbqH+brGTbq0g==",
"integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=",
"dev": true
},
"functions-have-names": {
@ -16119,6 +16144,16 @@
"gtoken": "^4.1.0",
"jws": "^4.0.0",
"lru-cache": "^5.0.0"
},
"dependencies": {
"lru-cache": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
"requires": {
"yallist": "^3.0.2"
}
}
}
},
"google-p12-pem": {
@ -16173,6 +16208,11 @@
"version": "6.3.0",
"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
"integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw=="
},
"yallist": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz",
"integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g=="
}
}
},
@ -20022,18 +20062,11 @@
}
},
"lru-cache": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz",
"integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==",
"requires": {
"yallist": "^3.0.2"
},
"dependencies": {
"yallist": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz",
"integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g=="
}
"yallist": "^4.0.0"
}
},
"lz-string": {
@ -21444,7 +21477,7 @@
"natural-compare": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz",
"integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==",
"integrity": "sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=",
"dev": true
},
"ncp": {
@ -27249,7 +27282,7 @@
"require-like": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/require-like/-/require-like-0.1.2.tgz",
"integrity": "sha1-rW8wwTvs15cBDEaK+ndcDAprR/o=",
"integrity": "sha512-oyrU88skkMtDdauHDuKVrgR+zuItqr6/c//FXzvmxRGMexSDc6hNvJInGW3LL46n+8b50RykrvwSUIIQH2LQ5A==",
"dev": true
},
"require-main-filename": {
@ -30443,6 +30476,23 @@
"rimraf": "^2.7.1",
"ssri": "^7.0.0",
"unique-filename": "^1.1.1"
},
"dependencies": {
"lru-cache": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
"dev": true,
"requires": {
"yallist": "^3.0.2"
}
},
"yallist": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz",
"integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==",
"dev": true
}
}
},
"chownr": {

View file

@ -106,6 +106,7 @@
"jsonwebtoken": "^8.5.1",
"lodash": "^4.17.19",
"logger-sharelatex": "^2.2.0",
"lru-cache": "^6.0.0",
"mailchimp-api-v3": "^1.12.0",
"marked": "^0.3.5",
"match-sorter": "^6.2.0",

View file

@ -7,6 +7,7 @@ const NOW_IN_S = Date.now() / 1000
const ONE_WEEK_IN_S = 60 * 60 * 24 * 7
const TEN_SECONDS = 10 * 1000
const CACHE_SIZE = parseInt(process.env.CACHE_SIZE, 10) || 100
const DRY_RUN = process.env.DRY_RUN === 'true'
if (!process.env.FIRST_PROJECT_ID) {
console.error('Set FIRST_PROJECT_ID and re-run.')
@ -22,6 +23,11 @@ const LET_USER_DOUBLE_CHECK_INPUTS_FOR =
const DUMMY_NAME = 'unknown.tex'
const DUMMY_TIME = new Date('2021-04-12T00:00:00.000Z')
const LRUCache = require('lru-cache')
const deletedProjectsCache = new LRUCache({
max: CACHE_SIZE
})
// Extract the creation time embedded in a BSON ObjectId, expressed as
// seconds since the Unix epoch (may be fractional; no rounding applied).
function getSecondsFromObjectId(id) {
  const creationTimeMs = id.getTimestamp().getTime()
  return creationTimeMs / 1000
}
@ -30,21 +36,18 @@ async function main() {
await letUserDoubleCheckInputs()
await waitForDb()
let start = getSecondsFromObjectId(FIRST_PROJECT_ID)
// include the FIRST_PROJECT_ID in the first batch
start -= 1
let startId = FIRST_PROJECT_ID
let nProcessed = 0
while (start < STOP_AT_S) {
let end = start + INCREMENT_BY_S
const startId = ObjectId.createFromTime(start)
while (getSecondsFromObjectId(startId) <= STOP_AT_S) {
const end = getSecondsFromObjectId(startId) + INCREMENT_BY_S
let endId = ObjectId.createFromTime(end)
const query = {
project_id: {
// do not include edge
$gt: startId,
// include edge
$lte: endId
$gte: startId,
// exclude edge
$lt: endId
},
deleted: true,
name: {
@ -65,27 +68,36 @@ async function main() {
if (docs.length === BATCH_SIZE) {
endId = docs[docs.length - 1].project_id
end = getSecondsFromObjectId(endId)
}
}
console.error('Processed %d until %s', nProcessed, endId)
start = end
startId = endId
}
}
// Look up the deletedProjects entry for the given project id, memoizing
// the result in the LRU cache so multiple docs belonging to the same
// project trigger only a single mongo query. A "not found" result (null)
// is cached as well — the has() check (rather than a truthiness check on
// get()) is what makes the cached-null case short-circuit correctly.
async function getDeletedProject(projectId) {
  const key = projectId.toString()
  if (!deletedProjectsCache.has(key)) {
    // Fetch only the fields the back-fill actually reads.
    const fromDb = await db.deletedProjects.findOne(
      { 'deleterData.deletedProjectId': projectId },
      {
        projection: {
          _id: 1,
          'project.deletedDocs': 1
        }
      }
    )
    deletedProjectsCache.set(key, fromDb)
  }
  return deletedProjectsCache.get(key)
}
async function processBatch(docs) {
for (const doc of docs) {
const { _id: docId, project_id: projectId } = doc
const deletedProject = await db.deletedProjects.findOne(
{ 'deleterData.deletedProjectId': projectId },
{
projection: {
_id: 1,
'project.deletedDocs': 1
}
}
)
const deletedProject = await getDeletedProject(projectId)
let name = DUMMY_NAME
let deletedAt = DUMMY_TIME
if (deletedProject) {
@ -118,6 +130,7 @@ async function letUserDoubleCheckInputs() {
JSON.stringify(
{
BATCH_SIZE,
CACHE_SIZE,
DRY_RUN,
FIRST_PROJECT_ID,
INCREMENT_BY_S,

View file

@ -7,7 +7,7 @@ const { db, ObjectId } = require('../../../app/src/infrastructure/mongodb')
const DUMMY_NAME = 'unknown.tex'
const DUMMY_TIME = new Date('2021-04-12T00:00:00.000Z')
const ONE_DAY_IN_S = 60 * 60 * 24
const BATCH_SIZE = 2
const BATCH_SIZE = 3
// Seconds-since-epoch encoded in the ObjectId's timestamp component.
function getSecondsFromObjectId(id) {
  const asDate = id.getTimestamp()
  return asDate.getTime() / 1000
}
docIds[7] = getObjectIdFromDate('2021-04-16T00:01:00.000Z')
docIds[8] = getObjectIdFromDate('2021-04-16T00:02:00.000Z')
docIds[9] = getObjectIdFromDate('2021-04-16T00:03:00.000Z')
docIds[10] = getObjectIdFromDate('2021-04-16T00:04:00.000Z')
docIds[11] = getObjectIdFromDate('2021-04-16T00:05:00.000Z')
projectIds = []
projectIds[0] = getObjectIdFromDate('2021-04-01T00:00:00.000Z')
@ -46,6 +48,9 @@ describe('BackFillDummyDocMeta', function () {
projectIds[7] = getObjectIdFromDate('2021-04-16T00:01:00.000Z')
projectIds[8] = getObjectIdFromDate('2021-04-16T00:02:00.000Z')
projectIds[9] = getObjectIdFromDate('2021-04-16T00:03:00.000Z')
// two docs in the same project
projectIds[10] = projectIds[9]
projectIds[11] = projectIds[4]
stopAtSeconds = new Date('2021-04-17T00:00:00.000Z').getTime() / 1000
})
@ -72,7 +77,10 @@ describe('BackFillDummyDocMeta', function () {
// multiple in a single batch
{ _id: docIds[7], project_id: projectIds[7], deleted: true },
{ _id: docIds[8], project_id: projectIds[8], deleted: true },
{ _id: docIds[9], project_id: projectIds[9], deleted: true }
{ _id: docIds[9], project_id: projectIds[9], deleted: true },
// two docs in one project
{ _id: docIds[10], project_id: projectIds[10], deleted: true },
{ _id: docIds[11], project_id: projectIds[11], deleted: true }
])
})
beforeEach('insert deleted project context', async function () {
@ -90,7 +98,10 @@ describe('BackFillDummyDocMeta', function () {
{
deleterData: { deletedProjectId: projectIds[4] },
project: {
deletedDocs: [{ _id: docIds[4], name: 'main.tex', deletedAt: now }]
deletedDocs: [
{ _id: docIds[4], name: 'main.tex', deletedAt: now },
{ _id: docIds[11], name: 'main.tex', deletedAt: now }
]
}
}
])
@ -100,6 +111,7 @@ describe('BackFillDummyDocMeta', function () {
async function runScript(dryRun) {
options = {
BATCH_SIZE,
CACHE_SIZE: 100,
DRY_RUN: dryRun,
FIRST_PROJECT_ID: projectIds[0].toString(),
INCREMENT_BY_S: ONE_DAY_IN_S,
@ -126,6 +138,34 @@ describe('BackFillDummyDocMeta', function () {
.split('\n')
.filter(line => !line.includes('Using settings from'))
const oneDayFromProjectId9InSeconds =
getSecondsFromObjectId(projectIds[9]) + ONE_DAY_IN_S
const oneDayFromProjectId9AsObjectId = getObjectIdFromDate(
1000 * oneDayFromProjectId9InSeconds
)
let overlappingPartStdOut
let overlappingPartStdErr
if (dryRun) {
// In dry-run, the previous id will get processed again as the name has not been updated.
overlappingPartStdOut = [
`Back filling dummy meta data for ["${docIds[9]}","${docIds[10]}"]`,
`Orphaned deleted doc ${docIds[9]} (no deletedProjects entry)`,
`Orphaned deleted doc ${docIds[10]} (no deletedProjects entry)`
]
overlappingPartStdErr = [
`Processed 11 until ${oneDayFromProjectId9AsObjectId}`
]
} else {
// Outside dry-run, the previous id will not match again as the `name` has been back-filled.
overlappingPartStdOut = [
`Back filling dummy meta data for ["${docIds[10]}"]`,
`Orphaned deleted doc ${docIds[10]} (no deletedProjects entry)`
]
overlappingPartStdErr = [
`Processed 10 until ${oneDayFromProjectId9AsObjectId}`
]
}
expect(stdOut).to.deep.equal([
`Back filling dummy meta data for ["${docIds[0]}"]`,
`Orphaned deleted doc ${docIds[0]} (no deletedProjects entry)`,
@ -135,42 +175,40 @@ describe('BackFillDummyDocMeta', function () {
`Orphaned deleted doc ${docIds[2]} (failed hard deletion)`,
`Back filling dummy meta data for ["${docIds[3]}"]`,
`Missing deletedDoc for ${docIds[3]}`,
`Back filling dummy meta data for ["${docIds[4]}"]`,
// two docs in the same project
`Back filling dummy meta data for ["${docIds[4]}","${docIds[11]}"]`,
`Found deletedDoc for ${docIds[4]}`,
`Found deletedDoc for ${docIds[11]}`,
// 7,8,9 are on the same day, but exceed the batch size of 2
`Back filling dummy meta data for ["${docIds[7]}","${docIds[8]}"]`,
`Back filling dummy meta data for ["${docIds[7]}","${docIds[8]}","${docIds[9]}"]`,
`Orphaned deleted doc ${docIds[7]} (no deletedProjects entry)`,
`Orphaned deleted doc ${docIds[8]} (no deletedProjects entry)`,
`Back filling dummy meta data for ["${docIds[9]}"]`,
`Orphaned deleted doc ${docIds[9]} (no deletedProjects entry)`,
// Potential double processing
...overlappingPartStdOut,
''
])
const oneDayFromProjectId8InSeconds =
getSecondsFromObjectId(projectIds[8]) + ONE_DAY_IN_S
const oneDayFromProjectId8AsObjectId = getObjectIdFromDate(
1000 * oneDayFromProjectId8InSeconds
)
expect(stdErr).to.deep.equal([
...`Options: ${JSON.stringify(options, null, 2)}`.split('\n'),
'Waiting for you to double check inputs for 1 ms',
`Processed 1 until ${getObjectIdFromDate('2021-04-01T23:59:59.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-02T23:59:59.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-03T23:59:59.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-04T23:59:59.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-05T23:59:59.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-06T23:59:59.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-07T23:59:59.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-08T23:59:59.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-09T23:59:59.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-10T23:59:59.000Z')}`,
`Processed 3 until ${getObjectIdFromDate('2021-04-11T23:59:59.000Z')}`,
`Processed 4 until ${getObjectIdFromDate('2021-04-12T23:59:59.000Z')}`,
`Processed 5 until ${getObjectIdFromDate('2021-04-13T23:59:59.000Z')}`,
`Processed 5 until ${getObjectIdFromDate('2021-04-14T23:59:59.000Z')}`,
`Processed 5 until ${getObjectIdFromDate('2021-04-15T23:59:59.000Z')}`,
// 7,8,9 are on the same day, but exceed the batch size of 2
`Processed 7 until ${projectIds[8]}`,
`Processed 8 until ${oneDayFromProjectId8AsObjectId}`,
`Processed 1 until ${getObjectIdFromDate('2021-04-02T00:00:00.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-03T00:00:00.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-04T00:00:00.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-05T00:00:00.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-06T00:00:00.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-07T00:00:00.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-08T00:00:00.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-09T00:00:00.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-10T00:00:00.000Z')}`,
`Processed 2 until ${getObjectIdFromDate('2021-04-11T00:00:00.000Z')}`,
`Processed 3 until ${getObjectIdFromDate('2021-04-12T00:00:00.000Z')}`,
`Processed 4 until ${getObjectIdFromDate('2021-04-13T00:00:00.000Z')}`,
`Processed 6 until ${getObjectIdFromDate('2021-04-14T00:00:00.000Z')}`,
`Processed 6 until ${getObjectIdFromDate('2021-04-15T00:00:00.000Z')}`,
`Processed 6 until ${getObjectIdFromDate('2021-04-16T00:00:00.000Z')}`,
// 7,8,9,10 are on the same day, but exceed the batch size of 3
`Processed 9 until ${projectIds[9]}`,
...overlappingPartStdErr,
'Done.',
''
])
@ -199,7 +237,9 @@ describe('BackFillDummyDocMeta', function () {
{ _id: docIds[6], project_id: projectIds[6] },
{ _id: docIds[7], project_id: projectIds[7], deleted: true },
{ _id: docIds[8], project_id: projectIds[8], deleted: true },
{ _id: docIds[9], project_id: projectIds[9], deleted: true }
{ _id: docIds[9], project_id: projectIds[9], deleted: true },
{ _id: docIds[10], project_id: projectIds[10], deleted: true },
{ _id: docIds[11], project_id: projectIds[11], deleted: true }
])
})
})
@ -275,6 +315,20 @@ describe('BackFillDummyDocMeta', function () {
deleted: true,
name: DUMMY_NAME,
deletedAt: DUMMY_TIME
},
{
_id: docIds[10],
project_id: projectIds[10],
deleted: true,
name: DUMMY_NAME,
deletedAt: DUMMY_TIME
},
{
_id: docIds[11],
project_id: projectIds[11],
deleted: true,
name: 'main.tex',
deletedAt: now
}
])
})