mirror of
https://github.com/overleaf/overleaf.git
synced 2025-04-21 03:03:40 +00:00
Merge pull request #22134 from overleaf/jpa-broken-file-trees
[history-v1] back_fill_file_hash: gracefully handle broken file-trees GitOrigin-RevId: 463c785e98581364b107f3262951e7fa0fb88a0f
This commit is contained in:
parent
b7d37b434a
commit
40603e0561
2 changed files with 191 additions and 45 deletions
|
@ -823,14 +823,45 @@ async function updateFileRefInMongo(entry) {
|
|||
* @return Generator<QueueEntry>
|
||||
*/
|
||||
function* findFiles(ctx, folder, path, isInputLoop = false) {
|
||||
if (!folder || typeof folder !== 'object') {
|
||||
ctx.fileTreeBroken = true
|
||||
logger.warn({ projectId: ctx.projectId, path }, 'bad file-tree, bad folder')
|
||||
return
|
||||
}
|
||||
if (!Array.isArray(folder.folders)) {
|
||||
folder.folders = []
|
||||
ctx.fileTreeBroken = true
|
||||
logger.warn(
|
||||
{ projectId: ctx.projectId, path: `${path}.folders` },
|
||||
'bad file-tree, bad folders'
|
||||
)
|
||||
}
|
||||
let i = 0
|
||||
for (const child of folder.folders) {
|
||||
const idx = i++
|
||||
yield* findFiles(ctx, child, `${path}.folders.${idx}`, isInputLoop)
|
||||
}
|
||||
if (!Array.isArray(folder.fileRefs)) {
|
||||
folder.fileRefs = []
|
||||
ctx.fileTreeBroken = true
|
||||
logger.warn(
|
||||
{ projectId: ctx.projectId, path: `${path}.fileRefs` },
|
||||
'bad file-tree, bad fileRefs'
|
||||
)
|
||||
}
|
||||
i = 0
|
||||
for (const fileRef of folder.fileRefs) {
|
||||
const idx = i++
|
||||
const fileRefPath = `${path}.fileRefs.${idx}`
|
||||
if (!fileRef._id || !(fileRef._id instanceof ObjectId)) {
|
||||
ctx.fileTreeBroken = true
|
||||
logger.warn(
|
||||
{ projectId: ctx.projectId, path: fileRefPath },
|
||||
'bad file-tree, bad fileRef id'
|
||||
)
|
||||
continue
|
||||
}
|
||||
const fileId = fileRef._id.toString()
|
||||
if (PROCESS_HASHED_FILES && fileRef.hash) {
|
||||
if (ctx.canSkipProcessingHashedFile(fileRef.hash)) continue
|
||||
if (isInputLoop) {
|
||||
|
@ -840,7 +871,7 @@ function* findFiles(ctx, folder, path, isInputLoop = false) {
|
|||
yield {
|
||||
ctx,
|
||||
cacheKey: fileRef.hash,
|
||||
fileId: fileRef._id.toString(),
|
||||
fileId,
|
||||
path: MONGO_PATH_SKIP_WRITE_HASH_TO_FILE_TREE,
|
||||
hash: fileRef.hash,
|
||||
}
|
||||
|
@ -852,9 +883,9 @@ function* findFiles(ctx, folder, path, isInputLoop = false) {
|
|||
}
|
||||
yield {
|
||||
ctx,
|
||||
cacheKey: fileRef._id.toString(),
|
||||
fileId: fileRef._id.toString(),
|
||||
path: `${path}.fileRefs.${idx}`,
|
||||
cacheKey: fileId,
|
||||
fileId,
|
||||
path: fileRefPath,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -918,10 +949,14 @@ function* findFileInBatch(
|
|||
}
|
||||
}
|
||||
try {
|
||||
yield* findFiles(ctx, project.rootFolder[0], prefix, true)
|
||||
yield* findFiles(ctx, project.rootFolder?.[0], prefix, true)
|
||||
} catch (err) {
|
||||
STATS.badFileTrees++
|
||||
logger.error({ err, projectId: projectIdS }, 'bad file-tree')
|
||||
logger.error(
|
||||
{ err, projectId: projectIdS },
|
||||
'bad file-tree, processing error'
|
||||
)
|
||||
} finally {
|
||||
if (ctx.fileTreeBroken) STATS.badFileTrees++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1013,6 +1048,9 @@ class ProjectContext {
|
|||
/** @type {number} */
|
||||
remainingQueueEntries = 0
|
||||
|
||||
/** @type {boolean} */
|
||||
fileTreeBroken = false
|
||||
|
||||
/**
|
||||
* @param {ObjectId} projectId
|
||||
* @param {string} historyId
|
||||
|
|
|
@ -104,14 +104,20 @@ describe('back_fill_file_hash script', function () {
|
|||
const projectIdHardDeleted = objectIdFromTime('2017-01-01T00:08:00Z')
|
||||
const projectIdNoOverleaf = objectIdFromTime('2017-01-01T00:09:00Z')
|
||||
const projectIdNoOverleafDeleted = objectIdFromTime('2017-01-01T00:10:00Z')
|
||||
const projectIdBadFileTree = objectIdFromTime('2024-01-01T00:11:00Z')
|
||||
const projectIdBadFileTree0 = objectIdFromTime('2024-01-01T00:11:00Z')
|
||||
const projectIdBadFileTree1 = objectIdFromTime('2024-01-01T00:12:00Z')
|
||||
const projectIdBadFileTree2 = objectIdFromTime('2024-01-01T00:13:00Z')
|
||||
const projectIdBadFileTree3 = objectIdFromTime('2024-01-01T00:14:00Z')
|
||||
const historyId0 = 42 // stored as number is mongo
|
||||
const historyId1 = projectId1.toString()
|
||||
const historyId2 = projectId2.toString()
|
||||
const historyId3 = projectId3.toString()
|
||||
const historyIdDeleted0 = projectIdDeleted0.toString()
|
||||
const historyIdDeleted1 = projectIdDeleted1.toString()
|
||||
const historyIdBadFileTree = projectIdBadFileTree.toString()
|
||||
const historyIdBadFileTree0 = projectIdBadFileTree0.toString()
|
||||
const historyIdBadFileTree1 = projectIdBadFileTree1.toString()
|
||||
const historyIdBadFileTree2 = projectIdBadFileTree2.toString()
|
||||
const historyIdBadFileTree3 = projectIdBadFileTree3.toString()
|
||||
const fileId0 = objectIdFromTime('2017-02-01T00:00:00Z')
|
||||
const fileId1 = objectIdFromTime('2017-02-01T00:01:00Z')
|
||||
const fileId2 = objectIdFromTime('2017-02-01T00:02:00Z')
|
||||
|
@ -121,6 +127,7 @@ describe('back_fill_file_hash script', function () {
|
|||
const fileId6 = objectIdFromTime('2017-02-01T00:06:00Z')
|
||||
const fileId7 = objectIdFromTime('2017-02-01T00:07:00Z')
|
||||
const fileId8 = objectIdFromTime('2017-02-01T00:08:00Z')
|
||||
const fileId9 = objectIdFromTime('2017-02-01T00:09:00Z')
|
||||
const fileIdDeleted1 = objectIdFromTime('2017-03-01T00:01:00Z')
|
||||
const fileIdDeleted2 = objectIdFromTime('2017-03-01T00:02:00Z')
|
||||
const fileIdDeleted3 = objectIdFromTime('2017-03-01T00:03:00Z')
|
||||
|
@ -204,11 +211,16 @@ describe('back_fill_file_hash script', function () {
|
|||
hasHash: true,
|
||||
},
|
||||
{
|
||||
projectId: projectIdBadFileTree,
|
||||
historyId: historyIdBadFileTree,
|
||||
projectId: projectIdBadFileTree0,
|
||||
historyId: historyIdBadFileTree0,
|
||||
hash: hashTextBlob3,
|
||||
content: contentTextBlob3,
|
||||
},
|
||||
{
|
||||
projectId: projectIdBadFileTree3,
|
||||
historyId: historyIdBadFileTree3,
|
||||
fileId: fileId9,
|
||||
},
|
||||
]
|
||||
if (PRINT_IDS_AND_HASHES_FOR_DEBUGGING) {
|
||||
const fileIds = {
|
||||
|
@ -342,9 +354,28 @@ describe('back_fill_file_hash script', function () {
|
|||
rootFolder: [{ fileRefs: [], folders: [] }],
|
||||
},
|
||||
{
|
||||
_id: projectIdBadFileTree,
|
||||
_id: projectIdBadFileTree0,
|
||||
overleaf: { history: { id: historyIdBadFileTree0 } },
|
||||
},
|
||||
{
|
||||
_id: projectIdBadFileTree1,
|
||||
rootFolder: [],
|
||||
overleaf: { history: { id: historyIdBadFileTree } },
|
||||
overleaf: { history: { id: historyIdBadFileTree1 } },
|
||||
},
|
||||
{
|
||||
_id: projectIdBadFileTree2,
|
||||
rootFolder: [{ fileRefs: [{ _id: null }] }],
|
||||
overleaf: { history: { id: historyIdBadFileTree2 } },
|
||||
},
|
||||
{
|
||||
_id: projectIdBadFileTree3,
|
||||
rootFolder: [
|
||||
{
|
||||
folders: [null, { folders: {}, fileRefs: 13 }],
|
||||
fileRefs: [{ _id: fileId9 }],
|
||||
},
|
||||
],
|
||||
overleaf: { history: { id: historyIdBadFileTree3 } },
|
||||
},
|
||||
])
|
||||
await deletedProjectsCollection.insertMany([
|
||||
|
@ -433,13 +464,18 @@ describe('back_fill_file_hash script', function () {
|
|||
{ _id: fileIdDeleted5, projectId: projectId0 },
|
||||
])
|
||||
|
||||
await testProjects.createEmptyProject(historyId0.toString())
|
||||
await testProjects.createEmptyProject(historyId1)
|
||||
await testProjects.createEmptyProject(historyId2)
|
||||
await testProjects.createEmptyProject(historyId3)
|
||||
await testProjects.createEmptyProject(historyIdDeleted0)
|
||||
await testProjects.createEmptyProject(historyIdDeleted1)
|
||||
await testProjects.createEmptyProject(historyIdBadFileTree)
|
||||
await Promise.all([
|
||||
testProjects.createEmptyProject(historyId0.toString()),
|
||||
testProjects.createEmptyProject(historyId1),
|
||||
testProjects.createEmptyProject(historyId2),
|
||||
testProjects.createEmptyProject(historyId3),
|
||||
testProjects.createEmptyProject(historyIdDeleted0),
|
||||
testProjects.createEmptyProject(historyIdDeleted1),
|
||||
testProjects.createEmptyProject(historyIdBadFileTree0),
|
||||
testProjects.createEmptyProject(historyIdBadFileTree1),
|
||||
testProjects.createEmptyProject(historyIdBadFileTree2),
|
||||
testProjects.createEmptyProject(historyIdBadFileTree3),
|
||||
])
|
||||
|
||||
const blobStore0 = new BlobStore(historyId0.toString())
|
||||
await blobStore0.putString(contentTextBlob0.toString())
|
||||
|
@ -447,7 +483,7 @@ describe('back_fill_file_hash script', function () {
|
|||
await blobStore1.putString(contentTextBlob1.toString())
|
||||
const blobStore2 = new BlobStore(historyId2.toString())
|
||||
await blobStore2.putString(contentTextBlob2.toString())
|
||||
const blobStoreBadFileTree = new BlobStore(historyIdBadFileTree.toString())
|
||||
const blobStoreBadFileTree = new BlobStore(historyIdBadFileTree0.toString())
|
||||
await blobStoreBadFileTree.putString(contentTextBlob3.toString())
|
||||
})
|
||||
|
||||
|
@ -518,6 +554,11 @@ describe('back_fill_file_hash script', function () {
|
|||
`${projectIdDeleted0}/${fileIdDeleted4}`,
|
||||
Stream.Readable.from([fileIdDeleted4.toString()])
|
||||
)
|
||||
await FILESTORE_PERSISTOR.sendStream(
|
||||
USER_FILES_BUCKET_NAME,
|
||||
`${projectIdBadFileTree3}/${fileId9}`,
|
||||
Stream.Readable.from([fileId9.toString()])
|
||||
)
|
||||
})
|
||||
|
||||
/**
|
||||
|
@ -703,9 +744,28 @@ describe('back_fill_file_hash script', function () {
|
|||
rootFolder: [{ fileRefs: [], folders: [] }],
|
||||
},
|
||||
{
|
||||
_id: projectIdBadFileTree,
|
||||
_id: projectIdBadFileTree0,
|
||||
overleaf: { history: { id: historyIdBadFileTree0 } },
|
||||
},
|
||||
{
|
||||
_id: projectIdBadFileTree1,
|
||||
rootFolder: [],
|
||||
overleaf: { history: { id: historyIdBadFileTree } },
|
||||
overleaf: { history: { id: historyIdBadFileTree1 } },
|
||||
},
|
||||
{
|
||||
_id: projectIdBadFileTree2,
|
||||
rootFolder: [{ fileRefs: [{ _id: null }] }],
|
||||
overleaf: { history: { id: historyIdBadFileTree2 } },
|
||||
},
|
||||
{
|
||||
_id: projectIdBadFileTree3,
|
||||
rootFolder: [
|
||||
{
|
||||
folders: [null, { folders: {}, fileRefs: 13 }],
|
||||
fileRefs: [{ _id: fileId9, hash: gitBlobHash(fileId9) }],
|
||||
},
|
||||
],
|
||||
overleaf: { history: { id: historyIdBadFileTree3 } },
|
||||
},
|
||||
])
|
||||
expect(await deletedProjectsCollection.find({}).toArray()).to.deep.equal([
|
||||
|
@ -881,9 +941,13 @@ describe('back_fill_file_hash script', function () {
|
|||
]
|
||||
: []),
|
||||
{
|
||||
_id: projectIdBadFileTree,
|
||||
_id: projectIdBadFileTree0,
|
||||
blobs: [binaryForGitBlobHash(hashTextBlob3)].sort(),
|
||||
},
|
||||
{
|
||||
_id: projectIdBadFileTree3,
|
||||
blobs: [binaryForGitBlobHash(gitBlobHash(fileId9))].sort(),
|
||||
},
|
||||
])
|
||||
})
|
||||
it('should process nothing on re-run', async function () {
|
||||
|
@ -895,10 +959,10 @@ describe('back_fill_file_hash script', function () {
|
|||
let stats = {
|
||||
...STATS_ALL_ZERO,
|
||||
// We still need to iterate over all the projects and blobs.
|
||||
projects: 7,
|
||||
blobs: 12,
|
||||
backedUpBlobs: 12,
|
||||
badFileTrees: 1,
|
||||
projects: 10,
|
||||
blobs: 13,
|
||||
backedUpBlobs: 13,
|
||||
badFileTrees: 4,
|
||||
}
|
||||
if (processHashedFiles) {
|
||||
stats = sumStats(stats, {
|
||||
|
@ -1042,11 +1106,11 @@ describe('back_fill_file_hash script', function () {
|
|||
writeToGCSEgress: 4000096,
|
||||
}
|
||||
const STATS_UP_FROM_PROJECT1_ONWARD = {
|
||||
projects: 5,
|
||||
projects: 8,
|
||||
blobs: 2,
|
||||
backedUpBlobs: 0,
|
||||
filesWithHash: 0,
|
||||
filesWithoutHash: 4,
|
||||
filesWithoutHash: 5,
|
||||
filesDuplicated: 0,
|
||||
filesRetries: 0,
|
||||
filesFailed: 0,
|
||||
|
@ -1056,18 +1120,18 @@ describe('back_fill_file_hash script', function () {
|
|||
projectDeleted: 0,
|
||||
projectHardDeleted: 0,
|
||||
fileHardDeleted: 0,
|
||||
badFileTrees: 1,
|
||||
mongoUpdates: 8,
|
||||
badFileTrees: 4,
|
||||
mongoUpdates: 10,
|
||||
deduplicatedWriteToAWSLocalCount: 1,
|
||||
deduplicatedWriteToAWSLocalEgress: 30,
|
||||
deduplicatedWriteToAWSRemoteCount: 0,
|
||||
deduplicatedWriteToAWSRemoteEgress: 0,
|
||||
readFromGCSCount: 6,
|
||||
readFromGCSIngress: 110,
|
||||
writeToAWSCount: 5,
|
||||
writeToAWSEgress: 143,
|
||||
writeToGCSCount: 3,
|
||||
writeToGCSEgress: 72,
|
||||
readFromGCSCount: 7,
|
||||
readFromGCSIngress: 134,
|
||||
writeToAWSCount: 6,
|
||||
writeToAWSEgress: 173,
|
||||
writeToGCSCount: 4,
|
||||
writeToGCSEgress: 96,
|
||||
}
|
||||
|
||||
function sumStats(a, b) {
|
||||
|
@ -1156,17 +1220,61 @@ describe('back_fill_file_hash script', function () {
|
|||
})
|
||||
})
|
||||
|
||||
/**
|
||||
* @param {ObjectId} projectId
|
||||
* @param {string} msg
|
||||
*/
|
||||
function expectBadFileTreeMessage(projectId, msg, path) {
|
||||
const line = output.result.stdout
|
||||
.split('\n')
|
||||
.find(l => l.includes(msg) && l.includes(projectId.toString()))
|
||||
expect(line).to.exist
|
||||
expect(JSON.parse(line)).to.include({
|
||||
projectId: projectId.toString(),
|
||||
msg,
|
||||
path,
|
||||
})
|
||||
}
|
||||
|
||||
it('should print stats', function () {
|
||||
expect(output.stats).deep.equal(STATS_ALL)
|
||||
})
|
||||
it('should have logged the bad file-tree', function () {
|
||||
const line = output.result.stdout
|
||||
.split('\n')
|
||||
.find(l => l.includes('bad file-tree'))
|
||||
expect(line).to.exist
|
||||
expect(JSON.parse(line)).to.include({
|
||||
projectId: projectIdBadFileTree.toString(),
|
||||
})
|
||||
expectBadFileTreeMessage(
|
||||
projectIdBadFileTree0,
|
||||
'bad file-tree, bad folder',
|
||||
'rootFolder.0'
|
||||
)
|
||||
expectBadFileTreeMessage(
|
||||
projectIdBadFileTree1,
|
||||
'bad file-tree, bad folder',
|
||||
'rootFolder.0'
|
||||
)
|
||||
expectBadFileTreeMessage(
|
||||
projectIdBadFileTree1,
|
||||
'bad file-tree, bad folder',
|
||||
'rootFolder.0'
|
||||
)
|
||||
expectBadFileTreeMessage(
|
||||
projectIdBadFileTree2,
|
||||
'bad file-tree, bad fileRef id',
|
||||
'rootFolder.0.fileRefs.0'
|
||||
)
|
||||
expectBadFileTreeMessage(
|
||||
projectIdBadFileTree3,
|
||||
'bad file-tree, bad folder',
|
||||
'rootFolder.0.folders.0'
|
||||
)
|
||||
expectBadFileTreeMessage(
|
||||
projectIdBadFileTree3,
|
||||
'bad file-tree, bad folders',
|
||||
'rootFolder.0.folders.1.folders'
|
||||
)
|
||||
expectBadFileTreeMessage(
|
||||
projectIdBadFileTree3,
|
||||
'bad file-tree, bad fileRefs',
|
||||
'rootFolder.0.folders.1.fileRefs'
|
||||
)
|
||||
})
|
||||
commonAssertions()
|
||||
})
|
||||
|
|
Loading…
Add table
Reference in a new issue