Merge pull request #22017 from overleaf/jpa-process-all-projects

[history-v1] back_fill_file_hash: process all projects

GitOrigin-RevId: 41c3cd59022bbac09552684ef2a99c58f2235ac3
Jakob Ackermann 2024-11-20 15:20:09 +01:00 committed by Copybot
parent 1fca37af61
commit 2e630e50dc
2 changed files with 19 additions and 21 deletions

@@ -84,17 +84,11 @@ ObjectId.cacheHexString = true
 const COLLECT_BLOBS = process.argv.includes('blobs')
-// Time of closing the ticket for adding hashes: https://github.com/overleaf/internal/issues/464#issuecomment-492668129
-const ALL_PROJECTS_HAVE_FILE_HASHES_AFTER = new Date('2019-05-15T14:02:00Z')
 const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
 const BATCH_RANGE_START =
   process.env.BATCH_RANGE_START ||
   ObjectId.createFromTime(PUBLIC_LAUNCH_DATE.getTime() / 1000).toString()
-const BATCH_RANGE_END =
-  process.env.BATCH_RANGE_END ||
-  ObjectId.createFromTime(
-    ALL_PROJECTS_HAVE_FILE_HASHES_AFTER.getTime() / 1000
-  ).toString()
+const BATCH_RANGE_END = process.env.BATCH_RANGE_END || new ObjectId().toString()
 // We need to control the start and end as ids of deleted projects are created at time of deletion.
 delete process.env.BATCH_RANGE_START
 delete process.env.BATCH_RANGE_END
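For context: BATCH_RANGE_START and BATCH_RANGE_END are ObjectId strings used as id bounds when batching over the projects collection, and the change above widens the default end bound from the 2019 "all projects have hashes" cut-off to "now". A minimal sketch of how such time-based ObjectId bounds behave, assuming the `mongodb` driver; the variable names here are illustrative, not the script's own:

// Sketch: deriving ObjectId range bounds from dates (assumes the `mongodb` package).
import { ObjectId } from 'mongodb'

// The first four bytes of an ObjectId encode a Unix timestamp in seconds, so an id
// created from a date sorts before every id generated after that date.
const start = ObjectId.createFromTime(new Date('2012-01-01T00:00:00Z').getTime() / 1000)
const end = new ObjectId() // created "now": effectively no upper bound, i.e. all projects

console.log(start.toString(), end.toString())
console.log(start.getTimestamp()) // 2012-01-01T00:00:00.000Z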

@@ -168,7 +168,7 @@ describe('back_fill_file_hash script', function () {
   { projectId: projectId1, historyId: historyId1, fileId: fileId1 },
   { projectId: projectId1, historyId: historyId1, fileId: fileIdDeleted1 },
   // { historyId: historyId2, fileId: fileId2 }, // already has hash
-  // { historyId: historyId3, fileId: fileId3 }, // too new
+  { projectId: projectId3, historyId: historyId3, fileId: fileId3 },
   {
     projectId: projectIdDeleted0,
     historyId: historyIdDeleted0,
@@ -284,7 +284,7 @@ describe('back_fill_file_hash script', function () {
     fileRefs: [],
     folders: [
       {
-        fileRefs: [{ _id: fileId3, hash: gitBlobHash(fileId3) }],
+        fileRefs: [{ _id: fileId3 }],
         folders: [],
       },
     ],
@@ -782,6 +782,10 @@ describe('back_fill_file_hash script', function () {
         binaryForGitBlobHash(gitBlobHash(fileIdDeleted2)),
       ].sort(),
     },
+    {
+      _id: projectId3,
+      blobs: [binaryForGitBlobHash(gitBlobHash(fileId3))].sort(),
+    },
   ])
 })
 it('should process nothing on re-run', async function () {
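For context: the expected `blobs` entries above are git blob SHA-1 hashes stored as binary values. A hedged sketch of what helpers like `gitBlobHash` and `binaryForGitBlobHash` plausibly compute, reimplemented here for illustration only (the real helpers live in the test file and may differ):

// Sketch: git-style blob hashing, illustrative reimplementation of the test helpers.
import { createHash } from 'node:crypto'
import { Binary } from 'mongodb'

// git hashes a blob as sha1("blob <byteLength>\0" + content)
function gitBlobHashHex(content) {
  const body = Buffer.from(content)
  return createHash('sha1')
    .update(`blob ${body.byteLength}\0`)
    .update(body)
    .digest('hex')
}

// Stored in Mongo as raw bytes rather than a 40-character hex string
function binaryForHash(hex) {
  return new Binary(Buffer.from(hex, 'hex'))
}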
@@ -789,9 +793,9 @@ describe('back_fill_file_hash script', function () {
   expect(rerun.stats).deep.equal({
     ...STATS_ALL_ZERO,
     // We still need to iterate over all the projects and blobs.
-    projects: 4,
-    blobs: 10,
-    backedUpBlobs: 10,
+    projects: 6,
+    blobs: 11,
+    backedUpBlobs: 11,
   })
 })
 it('should have backed up all the files', async function () {
@@ -923,10 +927,10 @@ describe('back_fill_file_hash script', function () {
   writeToGCSEgress: 4000096,
 }
 const STATS_UP_FROM_PROJECT1_ONWARD = {
-  projects: 2,
+  projects: 4,
   blobs: 1,
   backedUpBlobs: 0,
-  filesWithoutHash: 3,
+  filesWithoutHash: 4,
   filesDuplicated: 0,
   filesRetries: 0,
   filesFailed: 0,
@@ -936,17 +940,17 @@ describe('back_fill_file_hash script', function () {
   projectDeleted: 0,
   projectHardDeleted: 0,
   fileHardDeleted: 0,
-  mongoUpdates: 5,
+  mongoUpdates: 7,
   deduplicatedWriteToAWSLocalCount: 1,
   deduplicatedWriteToAWSLocalEgress: 30,
   deduplicatedWriteToAWSRemoteCount: 0,
   deduplicatedWriteToAWSRemoteEgress: 0,
-  readFromGCSCount: 4,
-  readFromGCSIngress: 79,
-  writeToAWSCount: 3,
-  writeToAWSEgress: 85,
-  writeToGCSCount: 2,
-  writeToGCSEgress: 48,
+  readFromGCSCount: 5,
+  readFromGCSIngress: 103,
+  writeToAWSCount: 4,
+  writeToAWSEgress: 115,
+  writeToGCSCount: 3,
+  writeToGCSEgress: 72,
 }
 function sumStats(a, b) {
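For context: the stats objects above are flat maps of numeric counters, and `sumStats` (whose body lies outside this hunk) combines two of them so that, for example, STATS_UP_FROM_PROJECT1_ONWARD can be added onto earlier totals. A minimal sketch of such a key-wise sum, assuming every field is numeric; this is an illustration, not the test's actual implementation:

// Sketch: key-wise sum of two flat stats objects (illustrative only).
function sumStatsSketch(a, b) {
  const sum = { ...a }
  for (const [key, value] of Object.entries(b)) {
    sum[key] = (sum[key] || 0) + value
  }
  return sum
}

// e.g. sumStatsSketch({ projects: 4, blobs: 1 }, { projects: 2, blobs: 10 })
// => { projects: 6, blobs: 11 }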