[project-history] fixes for checking blob exists (#23051)

* [project-history] skip HEAD on blob without file-hash

* [project-history] only 404s indicate a missing blob

* [project-history] error checking for existing blob is fatal

* [project-history] always skip creating blob if already exists

Ignore the createdBlob flag after performing the existence check on the blob;
the flag might not be present on old operations that were queued earlier.

* [project-history] fix unit tests

* [project-history] migrate test to new fetch call

GitOrigin-RevId: 457963042e9d836647d3c122516329d4dc9b544f
This commit is contained in:
Jakob Ackermann 2025-01-23 09:35:16 +00:00 committed by Copybot
parent f1e1d55d8c
commit 2640d4a566
3 changed files with 360 additions and 7 deletions

View file

@@ -247,6 +247,7 @@ function createBlobFromString(historyId, data, fileId, callback) {
}
function _checkBlobExists(historyId, hash, callback) {
if (!hash) return callback(null, false)
const url = `${Settings.overleaf.history.host}/projects/${historyId}/blobs/${hash}`
fetchNothing(url, {
method: 'HEAD',
@@ -256,7 +257,7 @@ function _checkBlobExists(historyId, hash, callback) {
callback(null, true)
})
.catch(err => {
if (err instanceof RequestFailedError) {
if (err instanceof RequestFailedError && err.response.status === 404) {
return callback(null, false)
}
callback(OError.tag(err), false)
@@ -337,11 +338,14 @@ export function createBlobForUpdate(projectId, historyId, update, callback) {
)
_checkBlobExists(historyId, update.hash, (err, blobExists) => {
if (err) {
logger.warn(
{ err, projectId, fileId, update },
'error checking whether blob exists, reading from filestore'
return callback(
new OError(
'error checking whether blob exists',
{ projectId, historyId, update },
err
)
)
} else if (update.createdBlob && blobExists) {
} else if (blobExists) {
logger.debug(
{ projectId, fileId, update },
'Skipping blob creation as it has already been created'

View file

@@ -4,8 +4,10 @@ import { expect } from 'chai'
import request from 'request'
import assert from 'node:assert'
import mongodb from 'mongodb-legacy'
import logger from '@overleaf/logger'
import * as ProjectHistoryClient from './helpers/ProjectHistoryClient.js'
import * as ProjectHistoryApp from './helpers/ProjectHistoryApp.js'
import sinon from 'sinon'
const { ObjectId } = mongodb
const EMPTY_FILE_HASH = 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
@@ -16,6 +18,15 @@ const MockWeb = () => nock('http://127.0.0.1:3000')
describe('Syncing with web and doc-updater', function () {
const historyId = new ObjectId().toString()
let loggerWarn, loggerError
beforeEach(function () {
loggerWarn = sinon.spy(logger, 'warn')
loggerError = sinon.spy(logger, 'error')
})
afterEach(function () {
loggerWarn.restore()
loggerError.restore()
})
beforeEach(function (done) {
this.timestamp = new Date()
@@ -202,7 +213,113 @@ describe('Syncing with web and doc-updater', function () {
MockFileStore()
.get(`/project/${this.project_id}/file/${this.file_id}`)
.reply(200, fileContents)
const headBlob = MockHistoryStore()
.head(`/api/projects/${historyId}/blobs/${fileHash}`)
.reply(404)
const createBlob = MockHistoryStore()
.put(`/api/projects/${historyId}/blobs/${fileHash}`, fileContents)
.reply(201)
const addFile = MockHistoryStore()
.post(`/api/projects/${historyId}/legacy_changes`, body => {
expect(body).to.deep.equal([
{
v2Authors: [],
authors: [],
timestamp: this.timestamp.toJSON(),
operations: [
{
pathname: 'test.png',
file: {
hash: fileHash,
},
},
],
origin: { kind: 'test-origin' },
},
])
return true
})
.query({ end_version: 0 })
.reply(204)
async.series(
[
cb => {
ProjectHistoryClient.resyncHistory(this.project_id, cb)
},
cb => {
const update = {
projectHistoryId: historyId,
resyncProjectStructure: {
docs: [],
files: [
{
file: this.file_id,
path: '/test.png',
_hash: fileHash,
url: `http://127.0.0.1:3009/project/${this.project_id}/file/${this.file_id}`,
},
{ path: '/persistedFile' },
],
},
meta: {
ts: this.timestamp,
},
}
ProjectHistoryClient.pushRawUpdate(this.project_id, update, cb)
},
cb => {
ProjectHistoryClient.flushProject(this.project_id, cb)
},
],
error => {
if (error) {
throw error
}
assert(!loggerWarn.called, 'no warning logged on 404')
assert(
headBlob.isDone(),
'HEAD /api/projects/:historyId/blobs/:hash should have been called'
)
assert(
createBlob.isDone(),
'/api/projects/:historyId/blobs/:hash should have been called'
)
assert(
addFile.isDone(),
`/api/projects/${historyId}/changes should have been called`
)
done()
}
)
})
it('should skip HEAD on blob without hash', function (done) {
MockHistoryStore()
.get(`/api/projects/${historyId}/latest/history`)
.reply(200, {
chunk: {
history: {
snapshot: {
files: {
persistedFile: { hash: EMPTY_FILE_HASH, byteLength: 0 },
},
},
changes: [],
},
startVersion: 0,
},
})
const fileContents = Buffer.from([1, 2, 3])
const fileHash = 'aed2973e4b8a7ff1b30ff5c4751e5a2b38989e74'
MockFileStore()
.get(`/project/${this.project_id}/file/${this.file_id}`)
.reply(200, fileContents)
const headBlob = MockHistoryStore()
.head(`/api/projects/${historyId}/blobs/undefined`)
.reply(500)
const createBlob = MockHistoryStore()
.put(`/api/projects/${historyId}/blobs/${fileHash}`, fileContents)
.reply(201)
@@ -263,6 +380,11 @@ describe('Syncing with web and doc-updater', function () {
if (error) {
throw error
}
assert(!loggerWarn.called, 'no warning logged on 404')
assert(
!headBlob.isDone(),
'HEAD /api/projects/:historyId/blobs/:hash should have been skipped'
)
assert(
createBlob.isDone(),
'/api/projects/:historyId/blobs/:hash should have been called'
@@ -275,6 +397,230 @@ describe('Syncing with web and doc-updater', function () {
}
)
})
// Regression test: a non-404 response from the blob-existence HEAD check is
// fatal — the resync flush must fail with a 500, and neither the blob upload
// nor the change push may be attempted.
it('should record error when checking blob fails with 500', function (done) {
// Baseline chunk so the resync has an existing history snapshot to diff against.
MockHistoryStore()
.get(`/api/projects/${historyId}/latest/history`)
.reply(200, {
chunk: {
history: {
snapshot: {
files: {
persistedFile: { hash: EMPTY_FILE_HASH, byteLength: 0 },
},
},
changes: [],
},
startVersion: 0,
},
})
const fileContents = Buffer.from([1, 2, 3])
const fileHash = 'aed2973e4b8a7ff1b30ff5c4751e5a2b38989e74'
MockFileStore()
.get(`/project/${this.project_id}/file/${this.file_id}`)
.reply(200, fileContents)
// The HEAD existence check fails with 500 — only a 404 means "blob missing".
const headBlob = MockHistoryStore()
.head(`/api/projects/${historyId}/blobs/${fileHash}`)
.reply(500)
// These two interceptors must remain unused when the check errors out.
const createBlob = MockHistoryStore()
.put(`/api/projects/${historyId}/blobs/${fileHash}`, fileContents)
.reply(201)
const addFile = MockHistoryStore()
.post(`/api/projects/${historyId}/legacy_changes`, body => {
expect(body).to.deep.equal([
{
v2Authors: [],
authors: [],
timestamp: this.timestamp.toJSON(),
operations: [
{
pathname: 'test.png',
file: {
hash: fileHash,
},
},
],
origin: { kind: 'test-origin' },
},
])
return true
})
.query({ end_version: 0 })
.reply(204)
async.series(
[
cb => {
ProjectHistoryClient.resyncHistory(this.project_id, cb)
},
cb => {
const update = {
projectHistoryId: historyId,
resyncProjectStructure: {
docs: [],
files: [
{
file: this.file_id,
path: '/test.png',
_hash: fileHash,
url: `http://127.0.0.1:3009/project/${this.project_id}/file/${this.file_id}`,
},
{ path: '/persistedFile' },
],
},
meta: {
ts: this.timestamp,
},
}
ProjectHistoryClient.pushRawUpdate(this.project_id, update, cb)
},
cb => {
// The flush is expected to fail; allow errors and assert on the status.
ProjectHistoryClient.flushProject(
this.project_id,
{
allowErrors: true,
},
(err, res) => {
if (err) return cb(err)
assert(res.statusCode === 500, 'resync should have failed')
cb()
}
)
},
],
error => {
if (error) {
throw error
}
// The fatal error must be logged with the OError tag added in the
// production change ('error checking whether blob exists').
assert(
loggerError.calledWithMatch(
sinon.match.any,
'error checking whether blob exists'
),
'error logged on 500'
)
assert(
headBlob.isDone(),
'HEAD /api/projects/:historyId/blobs/:hash should have been called'
)
assert(
!createBlob.isDone(),
'/api/projects/:historyId/blobs/:hash should have been skipped'
)
assert(
!addFile.isDone(),
`/api/projects/${historyId}/changes should have been skipped`
)
done()
}
)
})
// When the HEAD existence check returns 200 the blob is already stored:
// the PUT upload must be skipped, but the change must still be pushed.
it('should skip blob write when blob exists', function (done) {
// Baseline chunk so the resync has an existing history snapshot to diff against.
MockHistoryStore()
.get(`/api/projects/${historyId}/latest/history`)
.reply(200, {
chunk: {
history: {
snapshot: {
files: {
persistedFile: { hash: EMPTY_FILE_HASH, byteLength: 0 },
},
},
changes: [],
},
startVersion: 0,
},
})
const fileContents = Buffer.from([1, 2, 3])
const fileHash = 'aed2973e4b8a7ff1b30ff5c4751e5a2b38989e74'
MockFileStore()
.get(`/project/${this.project_id}/file/${this.file_id}`)
.reply(200, fileContents)
// Blob already present in the history store.
const headBlob = MockHistoryStore()
.head(`/api/projects/${historyId}/blobs/${fileHash}`)
.reply(200)
// The upload interceptor must remain unused; the change push must still fire.
const createBlob = MockHistoryStore()
.put(`/api/projects/${historyId}/blobs/${fileHash}`, fileContents)
.reply(201)
const addFile = MockHistoryStore()
.post(`/api/projects/${historyId}/legacy_changes`, body => {
expect(body).to.deep.equal([
{
v2Authors: [],
authors: [],
timestamp: this.timestamp.toJSON(),
operations: [
{
pathname: 'test.png',
file: {
hash: fileHash,
},
},
],
origin: { kind: 'test-origin' },
},
])
return true
})
.query({ end_version: 0 })
.reply(204)
async.series(
[
cb => {
ProjectHistoryClient.resyncHistory(this.project_id, cb)
},
cb => {
const update = {
projectHistoryId: historyId,
resyncProjectStructure: {
docs: [],
files: [
{
file: this.file_id,
path: '/test.png',
_hash: fileHash,
url: `http://127.0.0.1:3009/project/${this.project_id}/file/${this.file_id}`,
},
{ path: '/persistedFile' },
],
},
meta: {
ts: this.timestamp,
},
}
ProjectHistoryClient.pushRawUpdate(this.project_id, update, cb)
},
cb => {
ProjectHistoryClient.flushProject(this.project_id, cb)
},
],
error => {
if (error) {
throw error
}
// An existing blob is the happy path — no warning should be logged.
assert(!loggerWarn.called, 'no warning logged on 404')
assert(
headBlob.isDone(),
'HEAD /api/projects/:historyId/blobs/:hash should have been called'
)
assert(
!createBlob.isDone(),
'/api/projects/:historyId/blobs/:hash should have been skipped'
)
assert(
addFile.isDone(),
`/api/projects/${historyId}/changes should have been called`
)
done()
}
)
})
})
describe("when a file exists which shouldn't", function () {

View file

@@ -57,7 +57,6 @@ describe('HistoryStoreManager', function () {
}
this.request = sinon.stub()
this.request.get = sinon.stub()
this.logger = {
debug: sinon.stub(),
@@ -382,6 +381,9 @@ describe('HistoryStoreManager', function () {
this.fileStream = {}
this.hash = 'random-hash'
this.LocalFileWriter.bufferOnDisk.callsArgWith(4, null, this.hash)
this.FetchUtils.fetchNothing.rejects(
new RequestFailedError('', {}, { status: 404 })
)
this.FetchUtils.fetchStream.resolves(this.fileStream)
})
@@ -443,7 +445,7 @@ describe('HistoryStoreManager', function () {
})
it('should not request the file from the filestore', function () {
expect(this.request.get).to.not.have.been.called
expect(this.FetchUtils.fetchStream).to.not.have.been.called
})
})
@@ -503,6 +505,7 @@ describe('HistoryStoreManager', function () {
})
describe('when history-v1 confirms that the blob exists', function () {
beforeEach(function (done) {
this.FetchUtils.fetchNothing.resolves()
this.HistoryStoreManager.createBlobForUpdate(
this.projectId,
this.historyId,