From 58fbbf6269675b2d31918172c18b85d90a549b4a Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Tue, 20 Apr 2021 14:21:32 +0200 Subject: [PATCH] Merge pull request #3925 from overleaf/jpa-cache-deleted-project [scripts] back_fill_dummy_doc_meta: add a cache for deletedProjects GitOrigin-RevId: 3d24c9fc4985f33b2e18f8f63f0b690d65476d1b --- services/web/package-lock.json | 80 +++++++++--- services/web/package.json | 1 + .../web/scripts/back_fill_dummy_doc_meta.js | 53 +++++--- .../src/BackFillDummyDocMetaTests.js | 114 +++++++++++++----- 4 files changed, 183 insertions(+), 65 deletions(-) diff --git a/services/web/package-lock.json b/services/web/package-lock.json index 5a7a5db47d..46df6af542 100644 --- a/services/web/package-lock.json +++ b/services/web/package-lock.json @@ -2247,6 +2247,16 @@ "gtoken": "^4.1.0", "jws": "^4.0.0", "lru-cache": "^5.0.0" + }, + "dependencies": { + "lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "requires": { + "yallist": "^3.0.2" + } + } } }, "google-p12-pem": { @@ -2382,6 +2392,11 @@ "version": "7.0.3", "resolved": "https://registry.npmjs.org/uuid/-/uuid-7.0.3.tgz", "integrity": "sha512-DPSke0pXhTZgoF/d+WSt2QaKMCFSfx7QegxEWT+JOuHF5aWrKEn0G+ztjuJg/gG8/ItK+rbPCD/yNv8yyih6Cg==" + }, + "yallist": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", + "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==" } } }, @@ -9645,6 +9660,15 @@ "integrity": "sha512-a30VEBm4PEdx1dRB7MFK7BejejvCvBronbLjht+sHuGYj8PHs7M/5Z+rt5lw551vZ7yfTCj4Vuyy3mSJytDWRQ==", "dev": true }, + "lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "requires": { + "yallist": "^3.0.2" + } + }, "rimraf": { "version": "2.7.1", "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.7.1.tgz", @@ -9657,7 +9681,8 @@ "yallist": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", - "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==" + "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==", + "dev": true } } }, @@ -15445,7 +15470,7 @@ "functional-red-black-tree": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/functional-red-black-tree/-/functional-red-black-tree-1.0.1.tgz", - "integrity": "sha512-dsKNQNdj6xA3T+QlADDA7mOSlX0qiMINjn0cgr+eGHGsbSHzTabcIogz2+p/iqP1Xs6EP/sS2SbqH+brGTbq0g==", + "integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=", "dev": true }, "functions-have-names": { @@ -16119,6 +16144,16 @@ "gtoken": "^4.1.0", "jws": "^4.0.0", "lru-cache": "^5.0.0" + }, + "dependencies": { + "lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "requires": { + "yallist": "^3.0.2" + } + } } }, "google-p12-pem": { @@ -16173,6 +16208,11 @@ "version": "6.3.0", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", "integrity": 
"sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" + }, + "yallist": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", + "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==" } } }, @@ -20022,18 +20062,11 @@ } }, "lru-cache": { - "version": "5.1.1", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", - "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", "requires": { - "yallist": "^3.0.2" - }, - "dependencies": { - "yallist": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", - "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==" - } + "yallist": "^4.0.0" } }, "lz-string": { @@ -21444,7 +21477,7 @@ "natural-compare": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", - "integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==", + "integrity": "sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=", "dev": true }, "ncp": { @@ -27249,7 +27282,7 @@ "require-like": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/require-like/-/require-like-0.1.2.tgz", - "integrity": "sha1-rW8wwTvs15cBDEaK+ndcDAprR/o=", + "integrity": "sha512-oyrU88skkMtDdauHDuKVrgR+zuItqr6/c//FXzvmxRGMexSDc6hNvJInGW3LL46n+8b50RykrvwSUIIQH2LQ5A==", "dev": true }, "require-main-filename": { @@ -30443,6 +30476,23 @@ "rimraf": "^2.7.1", "ssri": "^7.0.0", "unique-filename": "^1.1.1" + }, + "dependencies": { + "lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "requires": { + "yallist": "^3.0.2" + } + }, + "yallist": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", + "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==", + "dev": true + } } }, "chownr": { diff --git a/services/web/package.json b/services/web/package.json index 1f647c83ab..626a3cc427 100644 --- a/services/web/package.json +++ b/services/web/package.json @@ -106,6 +106,7 @@ "jsonwebtoken": "^8.5.1", "lodash": "^4.17.19", "logger-sharelatex": "^2.2.0", + "lru-cache": "^6.0.0", "mailchimp-api-v3": "^1.12.0", "marked": "^0.3.5", "match-sorter": "^6.2.0", diff --git a/services/web/scripts/back_fill_dummy_doc_meta.js b/services/web/scripts/back_fill_dummy_doc_meta.js index cd2987cfab..09df7e8e42 100644 --- a/services/web/scripts/back_fill_dummy_doc_meta.js +++ b/services/web/scripts/back_fill_dummy_doc_meta.js @@ -7,6 +7,7 @@ const NOW_IN_S = Date.now() / 1000 const ONE_WEEK_IN_S = 60 * 60 * 24 * 7 const TEN_SECONDS = 10 * 1000 +const CACHE_SIZE = parseInt(process.env.CACHE_SIZE, 10) || 100 const DRY_RUN = process.env.DRY_RUN === 'true' if (!process.env.FIRST_PROJECT_ID) { console.error('Set FIRST_PROJECT_ID and re-run.') @@ -22,6 +23,11 @@ const LET_USER_DOUBLE_CHECK_INPUTS_FOR = const DUMMY_NAME = 'unknown.tex' const DUMMY_TIME = new 
Date('2021-04-12T00:00:00.000Z') +const LRUCache = require('lru-cache') +const deletedProjectsCache = new LRUCache({ + max: CACHE_SIZE +}) + function getSecondsFromObjectId(id) { return id.getTimestamp().getTime() / 1000 } @@ -30,21 +36,18 @@ async function main() { await letUserDoubleCheckInputs() await waitForDb() - let start = getSecondsFromObjectId(FIRST_PROJECT_ID) - // include the FIRST_PROJECT_ID in the first batch - start -= 1 + let startId = FIRST_PROJECT_ID let nProcessed = 0 - while (start < STOP_AT_S) { - let end = start + INCREMENT_BY_S - const startId = ObjectId.createFromTime(start) + while (getSecondsFromObjectId(startId) <= STOP_AT_S) { + const end = getSecondsFromObjectId(startId) + INCREMENT_BY_S let endId = ObjectId.createFromTime(end) const query = { project_id: { - // do not include edge - $gt: startId, // include edge - $lte: endId + $gte: startId, + // exclude edge + $lt: endId }, deleted: true, name: { @@ -65,27 +68,36 @@ async function main() { if (docs.length === BATCH_SIZE) { endId = docs[docs.length - 1].project_id - end = getSecondsFromObjectId(endId) } } console.error('Processed %d until %s', nProcessed, endId) - start = end + startId = endId } } +async function getDeletedProject(projectId) { + const cacheKey = projectId.toString() + if (deletedProjectsCache.has(cacheKey)) { + return deletedProjectsCache.get(cacheKey) + } + const deletedProject = await db.deletedProjects.findOne( + { 'deleterData.deletedProjectId': projectId }, + { + projection: { + _id: 1, + 'project.deletedDocs': 1 + } + } + ) + deletedProjectsCache.set(cacheKey, deletedProject) + return deletedProject +} + async function processBatch(docs) { for (const doc of docs) { const { _id: docId, project_id: projectId } = doc - const deletedProject = await db.deletedProjects.findOne( - { 'deleterData.deletedProjectId': projectId }, - { - projection: { - _id: 1, - 'project.deletedDocs': 1 - } - } - ) + const deletedProject = await getDeletedProject(projectId) let name = DUMMY_NAME let deletedAt = DUMMY_TIME if (deletedProject) { @@ -118,6 +130,7 @@ async function letUserDoubleCheckInputs() { JSON.stringify( { BATCH_SIZE, + CACHE_SIZE, DRY_RUN, FIRST_PROJECT_ID, INCREMENT_BY_S, diff --git a/services/web/test/acceptance/src/BackFillDummyDocMetaTests.js b/services/web/test/acceptance/src/BackFillDummyDocMetaTests.js index de28ed1d4f..2c51743de6 100644 --- a/services/web/test/acceptance/src/BackFillDummyDocMetaTests.js +++ b/services/web/test/acceptance/src/BackFillDummyDocMetaTests.js @@ -7,7 +7,7 @@ const { db, ObjectId } = require('../../../app/src/infrastructure/mongodb') const DUMMY_NAME = 'unknown.tex' const DUMMY_TIME = new Date('2021-04-12T00:00:00.000Z') const ONE_DAY_IN_S = 60 * 60 * 24 -const BATCH_SIZE = 2 +const BATCH_SIZE = 3 function getSecondsFromObjectId(id) { return id.getTimestamp().getTime() / 1000 @@ -34,6 +34,8 @@ describe('BackFillDummyDocMeta', function () { docIds[7] = getObjectIdFromDate('2021-04-16T00:01:00.000Z') docIds[8] = getObjectIdFromDate('2021-04-16T00:02:00.000Z') docIds[9] = getObjectIdFromDate('2021-04-16T00:03:00.000Z') + docIds[10] = getObjectIdFromDate('2021-04-16T00:04:00.000Z') + docIds[11] = getObjectIdFromDate('2021-04-16T00:05:00.000Z') projectIds = [] projectIds[0] = getObjectIdFromDate('2021-04-01T00:00:00.000Z') @@ -46,6 +48,9 @@ describe('BackFillDummyDocMeta', function () { projectIds[7] = getObjectIdFromDate('2021-04-16T00:01:00.000Z') projectIds[8] = getObjectIdFromDate('2021-04-16T00:02:00.000Z') projectIds[9] = 
getObjectIdFromDate('2021-04-16T00:03:00.000Z') + // two docs in the same project + projectIds[10] = projectIds[9] + projectIds[11] = projectIds[4] stopAtSeconds = new Date('2021-04-17T00:00:00.000Z').getTime() / 1000 }) @@ -72,7 +77,10 @@ describe('BackFillDummyDocMeta', function () { // multiple in a single batch { _id: docIds[7], project_id: projectIds[7], deleted: true }, { _id: docIds[8], project_id: projectIds[8], deleted: true }, - { _id: docIds[9], project_id: projectIds[9], deleted: true } + { _id: docIds[9], project_id: projectIds[9], deleted: true }, + // two docs in one project + { _id: docIds[10], project_id: projectIds[10], deleted: true }, + { _id: docIds[11], project_id: projectIds[11], deleted: true } ]) }) beforeEach('insert deleted project context', async function () { @@ -90,7 +98,10 @@ describe('BackFillDummyDocMeta', function () { { deleterData: { deletedProjectId: projectIds[4] }, project: { - deletedDocs: [{ _id: docIds[4], name: 'main.tex', deletedAt: now }] + deletedDocs: [ + { _id: docIds[4], name: 'main.tex', deletedAt: now }, + { _id: docIds[11], name: 'main.tex', deletedAt: now } + ] } } ]) @@ -100,6 +111,7 @@ describe('BackFillDummyDocMeta', function () { async function runScript(dryRun) { options = { BATCH_SIZE, + CACHE_SIZE: 100, DRY_RUN: dryRun, FIRST_PROJECT_ID: projectIds[0].toString(), INCREMENT_BY_S: ONE_DAY_IN_S, @@ -126,6 +138,34 @@ describe('BackFillDummyDocMeta', function () { .split('\n') .filter(line => !line.includes('Using settings from')) + const oneDayFromProjectId9InSeconds = + getSecondsFromObjectId(projectIds[9]) + ONE_DAY_IN_S + const oneDayFromProjectId9AsObjectId = getObjectIdFromDate( + 1000 * oneDayFromProjectId9InSeconds + ) + let overlappingPartStdOut + let overlappingPartStdErr + if (dryRun) { + // In dry-run, the previous id will get processed again as the name has not been updated. + overlappingPartStdOut = [ + `Back filling dummy meta data for ["${docIds[9]}","${docIds[10]}"]`, + `Orphaned deleted doc ${docIds[9]} (no deletedProjects entry)`, + `Orphaned deleted doc ${docIds[10]} (no deletedProjects entry)` + ] + overlappingPartStdErr = [ + `Processed 11 until ${oneDayFromProjectId9AsObjectId}` + ] + } else { + // Outside dry-run, the previous id will not match again as the `name` has been back-filled. 
+ overlappingPartStdOut = [ + `Back filling dummy meta data for ["${docIds[10]}"]`, + `Orphaned deleted doc ${docIds[10]} (no deletedProjects entry)` + ] + overlappingPartStdErr = [ + `Processed 10 until ${oneDayFromProjectId9AsObjectId}` + ] + } + expect(stdOut).to.deep.equal([ `Back filling dummy meta data for ["${docIds[0]}"]`, `Orphaned deleted doc ${docIds[0]} (no deletedProjects entry)`, @@ -135,42 +175,40 @@ describe('BackFillDummyDocMeta', function () { `Orphaned deleted doc ${docIds[2]} (failed hard deletion)`, `Back filling dummy meta data for ["${docIds[3]}"]`, `Missing deletedDoc for ${docIds[3]}`, - `Back filling dummy meta data for ["${docIds[4]}"]`, + // two docs in the same project + `Back filling dummy meta data for ["${docIds[4]}","${docIds[11]}"]`, `Found deletedDoc for ${docIds[4]}`, + `Found deletedDoc for ${docIds[11]}`, // 7,8,9 are on the same day, but exceed the batch size of 2 - `Back filling dummy meta data for ["${docIds[7]}","${docIds[8]}"]`, + `Back filling dummy meta data for ["${docIds[7]}","${docIds[8]}","${docIds[9]}"]`, `Orphaned deleted doc ${docIds[7]} (no deletedProjects entry)`, `Orphaned deleted doc ${docIds[8]} (no deletedProjects entry)`, - `Back filling dummy meta data for ["${docIds[9]}"]`, `Orphaned deleted doc ${docIds[9]} (no deletedProjects entry)`, + // Potential double processing + ...overlappingPartStdOut, '' ]) - const oneDayFromProjectId8InSeconds = - getSecondsFromObjectId(projectIds[8]) + ONE_DAY_IN_S - const oneDayFromProjectId8AsObjectId = getObjectIdFromDate( - 1000 * oneDayFromProjectId8InSeconds - ) expect(stdErr).to.deep.equal([ ...`Options: ${JSON.stringify(options, null, 2)}`.split('\n'), 'Waiting for you to double check inputs for 1 ms', - `Processed 1 until ${getObjectIdFromDate('2021-04-01T23:59:59.000Z')}`, - `Processed 2 until ${getObjectIdFromDate('2021-04-02T23:59:59.000Z')}`, - `Processed 2 until ${getObjectIdFromDate('2021-04-03T23:59:59.000Z')}`, - `Processed 2 until ${getObjectIdFromDate('2021-04-04T23:59:59.000Z')}`, - `Processed 2 until ${getObjectIdFromDate('2021-04-05T23:59:59.000Z')}`, - `Processed 2 until ${getObjectIdFromDate('2021-04-06T23:59:59.000Z')}`, - `Processed 2 until ${getObjectIdFromDate('2021-04-07T23:59:59.000Z')}`, - `Processed 2 until ${getObjectIdFromDate('2021-04-08T23:59:59.000Z')}`, - `Processed 2 until ${getObjectIdFromDate('2021-04-09T23:59:59.000Z')}`, - `Processed 2 until ${getObjectIdFromDate('2021-04-10T23:59:59.000Z')}`, - `Processed 3 until ${getObjectIdFromDate('2021-04-11T23:59:59.000Z')}`, - `Processed 4 until ${getObjectIdFromDate('2021-04-12T23:59:59.000Z')}`, - `Processed 5 until ${getObjectIdFromDate('2021-04-13T23:59:59.000Z')}`, - `Processed 5 until ${getObjectIdFromDate('2021-04-14T23:59:59.000Z')}`, - `Processed 5 until ${getObjectIdFromDate('2021-04-15T23:59:59.000Z')}`, - // 7,8,9 are on the same day, but exceed the batch size of 2 - `Processed 7 until ${projectIds[8]}`, - `Processed 8 until ${oneDayFromProjectId8AsObjectId}`, + `Processed 1 until ${getObjectIdFromDate('2021-04-02T00:00:00.000Z')}`, + `Processed 2 until ${getObjectIdFromDate('2021-04-03T00:00:00.000Z')}`, + `Processed 2 until ${getObjectIdFromDate('2021-04-04T00:00:00.000Z')}`, + `Processed 2 until ${getObjectIdFromDate('2021-04-05T00:00:00.000Z')}`, + `Processed 2 until ${getObjectIdFromDate('2021-04-06T00:00:00.000Z')}`, + `Processed 2 until ${getObjectIdFromDate('2021-04-07T00:00:00.000Z')}`, + `Processed 2 until ${getObjectIdFromDate('2021-04-08T00:00:00.000Z')}`, + `Processed 2 until 
${getObjectIdFromDate('2021-04-09T00:00:00.000Z')}`, + `Processed 2 until ${getObjectIdFromDate('2021-04-10T00:00:00.000Z')}`, + `Processed 2 until ${getObjectIdFromDate('2021-04-11T00:00:00.000Z')}`, + `Processed 3 until ${getObjectIdFromDate('2021-04-12T00:00:00.000Z')}`, + `Processed 4 until ${getObjectIdFromDate('2021-04-13T00:00:00.000Z')}`, + `Processed 6 until ${getObjectIdFromDate('2021-04-14T00:00:00.000Z')}`, + `Processed 6 until ${getObjectIdFromDate('2021-04-15T00:00:00.000Z')}`, + `Processed 6 until ${getObjectIdFromDate('2021-04-16T00:00:00.000Z')}`, + // 7,8,9,10 are on the same day, but exceed the batch size of 3 + `Processed 9 until ${projectIds[9]}`, + ...overlappingPartStdErr, 'Done.', '' ]) @@ -199,7 +237,9 @@ describe('BackFillDummyDocMeta', function () { { _id: docIds[6], project_id: projectIds[6] }, { _id: docIds[7], project_id: projectIds[7], deleted: true }, { _id: docIds[8], project_id: projectIds[8], deleted: true }, - { _id: docIds[9], project_id: projectIds[9], deleted: true } + { _id: docIds[9], project_id: projectIds[9], deleted: true }, + { _id: docIds[10], project_id: projectIds[10], deleted: true }, + { _id: docIds[11], project_id: projectIds[11], deleted: true } ]) }) }) @@ -275,6 +315,20 @@ describe('BackFillDummyDocMeta', function () { deleted: true, name: DUMMY_NAME, deletedAt: DUMMY_TIME + }, + { + _id: docIds[10], + project_id: projectIds[10], + deleted: true, + name: DUMMY_NAME, + deletedAt: DUMMY_TIME + }, + { + _id: docIds[11], + project_id: projectIds[11], + deleted: true, + name: 'main.tex', + deletedAt: now } ]) })
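
Illustration of the caching pattern this patch introduces (a standalone sketch, not part of the diff above): back_fill_dummy_doc_meta.js now memoizes deletedProjects lookups with lru-cache v6 (the version pinned in package.json), keyed by the stringified project id, so a batch containing several docs from the same deleted project hits Mongo only once per project. `loadDeletedProject` below is a hypothetical stand-in for the real `db.deletedProjects.findOne(...)` query.

    // Sketch of the memoization pattern, using the lru-cache v6 API as in the patch.
    const LRUCache = require('lru-cache')

    const CACHE_SIZE = parseInt(process.env.CACHE_SIZE, 10) || 100
    const deletedProjectsCache = new LRUCache({ max: CACHE_SIZE })

    // Hypothetical stand-in for db.deletedProjects.findOne({ 'deleterData.deletedProjectId': projectId }, ...)
    async function loadDeletedProject(projectId) {
      return null
    }

    async function getDeletedProject(projectId) {
      const cacheKey = projectId.toString()
      // Check has() before get() so that cached null results also short-circuit the lookup.
      if (deletedProjectsCache.has(cacheKey)) {
        return deletedProjectsCache.get(cacheKey)
      }
      const deletedProject = await loadDeletedProject(projectId)
      deletedProjectsCache.set(cacheKey, deletedProject)
      return deletedProject
    }

Because the cache is consulted with `has()` rather than a truthiness check on `get()`, a `null` result (a project with no deletedProjects entry, i.e. an orphaned doc) is cached as well, so repeated orphaned docs from the same project do not trigger repeated Mongo queries.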