Merge pull request #22177 from overleaf/jpa-file-view-hash-1

[web] migrate file-view to download from history-v1 (via web) 1/2

GitOrigin-RevId: b787e90c57af5e2704b06ba63502aa6fc09ea1df
This commit is contained in:
Jakob Ackermann 2024-11-28 09:09:51 +01:00 committed by Copybot
parent 622d7bca79
commit ce0d5fd383
16 changed files with 468 additions and 44 deletions

View file

@ -198,19 +198,52 @@ async function createProjectBlob(req, res, next) {
})
}
/**
 * HEAD handler for a project blob: reports the blob's byte length via the
 * Content-Length header without streaming any body.
 *
 * Responds 200 with Content-Length when the blob exists, 404 otherwise.
 *
 * @param {import('express').Request} req swagger-annotated request
 * @param {import('express').Response} res
 */
async function headProjectBlob(req, res) {
  const projectId = req.swagger.params.project_id.value
  const hash = req.swagger.params.hash.value
  const blob = await new BlobStore(projectId).getBlob(hash)
  if (!blob) {
    res.status(404).end()
    return
  }
  res.set('Content-Length', blob.getByteLength())
  res.status(200).end()
}
// Support simple, singular ranges starting from zero only, up-to 2MB = 2_000_000 bytes
const RANGE_HEADER = /^bytes=0-(\d{1,7})$/

/**
 * Parse an HTTP Range header into stream options for the persistor.
 *
 * Only simple "bytes=0-N" ranges are honoured. Anything else — missing
 * header, multi-part ranges, non-zero start, or an end offset beyond the
 * documented 2MB cap — yields an empty options object, which streams the
 * full blob.
 *
 * @param {string} header raw Range header value (may be empty)
 * @return {{}|{start: number, end: number}}
 * @private
 */
function _getRangeOpts(header) {
  if (!header) return {}
  const match = header.match(RANGE_HEADER)
  if (!match) return {}
  const end = parseInt(match[1], 10)
  // The 7-digit regex alone would admit ends up to 9,999,999; enforce the
  // advertised 2MB limit explicitly instead of relying on digit count.
  if (end > 2_000_000) return {}
  return { start: 0, end }
}
async function getProjectBlob(req, res, next) {
const projectId = req.swagger.params.project_id.value
const hash = req.swagger.params.hash.value
const opts = _getRangeOpts(req.swagger.params.range.value || '')
const blobStore = new BlobStore(projectId)
logger.debug({ projectId, hash }, 'getProjectBlob started')
try {
let stream
try {
stream = await blobStore.getStream(hash)
stream = await blobStore.getStream(hash, opts)
} catch (err) {
if (err instanceof Blob.NotFoundError) {
return render.notFound(res)
return res.status(404).end()
} else {
throw err
}
@ -271,5 +304,6 @@ module.exports = {
deleteProject: expressify(deleteProject),
createProjectBlob: expressify(createProjectBlob),
getProjectBlob: expressify(getProjectBlob),
headProjectBlob: expressify(headProjectBlob),
copyProjectBlob: expressify(copyProjectBlob),
}

View file

@ -70,6 +70,52 @@ exports.paths = {
operationId: 'getProjectBlob',
tags: ['Project'],
description: 'Fetch blob content by its project id and hash.',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'hash',
in: 'path',
description: 'Hexadecimal SHA-1 hash',
required: true,
type: 'string',
pattern: Blob.HEX_HASH_RX_STRING,
},
{
name: 'range',
in: 'header',
description: 'HTTP Range header',
required: false,
type: 'string',
},
],
produces: ['application/octet-stream'],
responses: {
200: {
description: 'Success',
schema: {
type: 'file',
},
},
404: {
description: 'Not Found',
schema: {
$ref: '#/definitions/Error',
},
},
},
security: [{ jwt: [] }, { token: [] }],
},
head: {
'x-swagger-router-controller': 'projects',
operationId: 'headProjectBlob',
tags: ['Project'],
description: 'Fetch blob content-length by its project id and hash.',
parameters: [
{
name: 'project_id',

View file

@ -309,14 +309,15 @@ class BlobStore {
* failure, so the caller must be prepared to retry on errors, if appropriate.
*
* @param {string} hash hexadecimal SHA-1 hash
* @param {Object} opts
* @return {Promise.<Readable>} a stream to read the file
*/
async getStream(hash) {
async getStream(hash, opts = {}) {
assert.blobHash(hash, 'bad hash')
const { bucket, key } = getBlobLocation(this.projectId, hash)
try {
const stream = await persistor.getObjectStream(bucket, key)
const stream = await persistor.getObjectStream(bucket, key, opts)
return stream
} catch (err) {
if (err instanceof objectPersistor.Errors.NotFoundError) {

View file

@ -116,6 +116,33 @@ describe('Project blobs API', function () {
expect(payload).to.equal(fileContents.toString())
})
it('supports range request', async function () {
  const url = new URL(
    testServer.url(
      `/api/projects/${projectId}/blobs/${testFiles.HELLO_TXT_HASH}`
    )
  )
  url.searchParams.append('token', token)
  const response = await fetch(url, { headers: { Range: 'bytes=0-4' } })
  const payload = await response.text()
  // NOTE(review): per RFC 7233 "bytes=0-4" is an inclusive range (5 bytes),
  // yet the expectation is the first 4 characters — this implies the
  // persistor treats the `end` option as exclusive; confirm intent.
  expect(payload).to.equal(fileContents.toString().slice(0, 4))
})

it('supports HEAD request', async function () {
  const url = new URL(
    testServer.url(
      `/api/projects/${projectId}/blobs/${testFiles.HELLO_TXT_HASH}`
    )
  )
  url.searchParams.append('token', token)
  const response = await fetch(url, { method: 'HEAD' })
  // HEAD must advertise the blob size without sending any body bytes.
  expect(response.headers.get('Content-Length')).to.equal(
    testFiles.HELLO_TXT_BYTE_LENGTH.toString()
  )
  const payload = await response.text()
  expect(payload).to.have.length(0)
})
it('rejects an unautorized request', async function () {
const response = await fetch(
testServer.url(

View file

@ -16,6 +16,7 @@ import * as HistoryApiManager from './HistoryApiManager.js'
import * as RetryManager from './RetryManager.js'
import * as FlushManager from './FlushManager.js'
import { pipeline } from 'node:stream'
import { RequestFailedError } from '@overleaf/fetch-utils'
const ONE_DAY_IN_SECONDS = 24 * 60 * 60
@ -27,6 +28,9 @@ export function getProjectBlob(req, res, next) {
blobHash,
(err, stream) => {
if (err != null) {
if (err instanceof RequestFailedError && err.response.status === 404) {
return res.status(404).end()
}
return next(OError.tag(err))
}
res.setHeader('Cache-Control', `private, max-age=${ONE_DAY_IN_SECONDS}`)

View file

@ -13,10 +13,61 @@ const ProjectDetailsHandler = require('../Project/ProjectDetailsHandler')
const ProjectEntityUpdateHandler = require('../Project/ProjectEntityUpdateHandler')
const RestoreManager = require('./RestoreManager')
const { pipeline } = require('stream')
const Stream = require('stream')
const { prepareZipAttachment } = require('../../infrastructure/Response')
const Features = require('../../infrastructure/Features')
const { expressify } = require('@overleaf/promise-utils')
// GET handler: stream the blob body to the client.
async function getBlob(req, res) {
  await requestBlob('GET', req, res)
}

// HEAD handler: send only headers (Content-Length) for the blob.
async function headBlob(req, res) {
  await requestBlob('HEAD', req, res)
}

/**
 * Fetch a blob from history-v1 (falling back to filestore when a fallback
 * file id is supplied via the query string) and relay it to the response.
 *
 * Responds 404 when the blob is missing from every source; otherwise sets
 * caching/type headers and pipes the upstream stream to the client.
 *
 * @param {string} method 'GET' or 'HEAD' — forwarded to the upstream fetch
 * @param {import('express').Request} req
 * @param {import('express').Response} res
 */
async function requestBlob(method, req, res) {
  const { project_id: projectId, hash } = req.params
  // Forward the client's Range header (if any) so partial downloads work.
  const range = req.get('Range')
  let url, stream, source, contentLength
  try {
    ;({ url, stream, source, contentLength } =
      await HistoryManager.promises.requestBlobWithFallback(
        projectId,
        hash,
        req.query.fallback,
        method,
        range
      ))
  } catch (err) {
    if (err instanceof Errors.NotFoundError) return res.status(404).end()
    throw err
  }
  // Record which backend actually served the blob (history-v1 or filestore).
  res.appendHeader('X-Served-By', source)

  // allow the browser to cache these immutable files
  // note: both "private" and "max-age" appear to be required for caching
  res.setHeader('Cache-Control', 'private, max-age=3600')

  if (contentLength) res.setHeader('Content-Length', contentLength) // set on HEAD
  res.setHeader('Content-Type', 'application/octet-stream')

  try {
    await Stream.promises.pipeline(stream, res)
  } catch (err) {
    // If the downstream request is cancelled, we get an
    // ERR_STREAM_PREMATURE_CLOSE, ignore these "errors".
    if (err?.code === 'ERR_STREAM_PREMATURE_CLOSE') return
    logger.warn({ err, url, method, range }, 'streaming blob error')
    throw err
  }
}
module.exports = HistoryController = {
getBlob: expressify(getBlob),
headBlob: expressify(headBlob),
proxyToHistoryApi(req, res, next) {
const userId = SessionManager.getLoggedInUserId(req.session)
const url = settings.apis.project_history.url + req.url
@ -38,30 +89,6 @@ module.exports = HistoryController = {
})
},
getBlob(req, res, next) {
const { project_id: projectId, blob } = req.params
ProjectGetter.getProject(
projectId,
{ 'overleaf.history.id': true },
(err, project) => {
if (err) return next(err)
const url = new URL(settings.apis.project_history.url)
url.pathname = `/project/${project.overleaf.history.id}/blob/${blob}`
pipeline(request(url.href), res, err => {
// If the downstream request is cancelled, we get an
// ERR_STREAM_PREMATURE_CLOSE.
if (err && err.code !== 'ERR_STREAM_PREMATURE_CLOSE') {
logger.warn({ url, err }, 'history API error')
next(err)
}
})
}
)
},
proxyToHistoryApiAndInjectUserDetails(req, res, next) {
const userId = SessionManager.getLoggedInUserId(req.session)
const url = settings.apis.project_history.url + req.url

View file

@ -1,11 +1,20 @@
const { callbackify } = require('util')
const { fetchJson, fetchNothing } = require('@overleaf/fetch-utils')
const {
fetchJson,
fetchNothing,
fetchStreamWithResponse,
RequestFailedError,
} = require('@overleaf/fetch-utils')
const fs = require('fs')
const settings = require('@overleaf/settings')
const OError = require('@overleaf/o-error')
const UserGetter = require('../User/UserGetter')
const ProjectGetter = require('../Project/ProjectGetter')
const HistoryBackupDeletionHandler = require('./HistoryBackupDeletionHandler')
const { ObjectId } = require('../../infrastructure/mongodb')
const Metrics = require('@overleaf/metrics')
const logger = require('@overleaf/logger')
const { NotFoundError } = require('../Errors/Errors')
async function initializeProject(projectId) {
const body = await fetchJson(`${settings.apis.project_history.url}/project`, {
@ -130,6 +139,65 @@ async function uploadBlobFromDisk(historyId, hash, byteLength, fsPath) {
})
}
/**
 * Fetch a blob stream for a project, preferring history-v1 and optionally
 * falling back to filestore when the blob is missing there.
 *
 * @param {string} projectId web project id (used for the filestore URL)
 * @param {string} hash hexadecimal SHA-1 hash of the blob in history-v1
 * @param {string} fileId filestore file id used as fallback; the fallback
 *   is only attempted when this is a valid ObjectId
 * @param {string} [method] 'GET' or 'HEAD'
 * @param {string} [range] raw Range header to forward upstream
 * @return {Promise<{url: URL, stream: ReadableStream, source: string, contentLength: string|null}>}
 * @throws {NotFoundError} when the blob is absent from all usable sources
 */
async function requestBlobWithFallback(
  projectId,
  hash,
  fileId,
  method = 'GET',
  range = ''
) {
  const project = await ProjectGetter.promises.getProject(projectId, {
    'overleaf.history.id': true,
  })
  // Talk to history-v1 directly to avoid streaming via project-history.
  let url = new URL(settings.apis.v1_history.url)
  url.pathname += `/projects/${project.overleaf.history.id}/blobs/${hash}`
  // NOTE(review): when no range is requested this sends an empty Range
  // header upstream — apparently tolerated by history-v1; confirm.
  const opts = { method, headers: { Range: range } }
  let stream, response, source
  try {
    ;({ stream, response } = await fetchStreamWithResponse(url, {
      ...opts,
      basicAuth: {
        user: settings.apis.v1_history.user,
        password: settings.apis.v1_history.pass,
      },
    }))
    source = 'history-v1'
  } catch (err) {
    // Only a 404 from history-v1 triggers the filestore fallback; any other
    // failure propagates to the caller.
    if (err instanceof RequestFailedError && err.response.status === 404) {
      if (ObjectId.isValid(fileId)) {
        url = new URL(settings.apis.filestore.url)
        url.pathname = `/project/${projectId}/file/${fileId}`
        try {
          ;({ stream, response } = await fetchStreamWithResponse(url, opts))
        } catch (err) {
          if (
            err instanceof RequestFailedError &&
            err.response.status === 404
          ) {
            throw new NotFoundError()
          }
          throw err
        }
        // The blob should have existed in history-v1; flag the gap.
        logger.warn({ projectId, hash, fileId }, 'missing history blob')
        source = 'filestore'
      } else {
        // No usable fallback id — treat the history-v1 404 as terminal.
        throw new NotFoundError()
      }
    } else {
      throw err
    }
  }
  // Track which backend served the blob so the migration can be monitored.
  Metrics.inc('request_blob', 1, { path: source })
  return {
    url,
    stream,
    source,
    contentLength: response.headers.get('Content-Length'),
  }
}
/**
* Warning: Don't use this method for large projects. It will eagerly load all
* the history data and apply all operations.
@ -283,6 +351,7 @@ module.exports = {
injectUserDetails: callbackify(injectUserDetails),
getCurrentContent: callbackify(getCurrentContent),
uploadBlobFromDisk: callbackify(uploadBlobFromDisk),
requestBlobWithFallback: callbackify(requestBlobWithFallback),
promises: {
initializeProject,
flushProject,
@ -293,5 +362,6 @@ module.exports = {
getCurrentContent,
getContentAtVersion,
uploadBlobFromDisk,
requestBlobWithFallback,
},
}

View file

@ -143,6 +143,7 @@ async function uploadFile(req, res, next) {
success: true,
entity_id: entity?._id,
entity_type: entity?.type,
hash: entity?.hash,
})
}
}

View file

@ -550,6 +550,36 @@ async function initialize(webRouter, privateApiRouter, publicApiRouter) {
AuthorizationMiddleware.ensureUserCanReadProject,
FileStoreController.getFile
)
webRouter.head(
'/project/:project_id/blob/:hash',
validate({
params: Joi.object({
project_id: Joi.objectId().required(),
hash: Joi.string().required().hex().length(40),
}),
query: Joi.object({
fallback: Joi.objectId().optional(),
}),
}),
RateLimiterMiddleware.rateLimit(rateLimiters.getProjectBlob),
AuthorizationMiddleware.ensureUserCanReadProject,
HistoryController.headBlob
)
webRouter.get(
'/project/:project_id/blob/:hash',
validate({
params: Joi.object({
project_id: Joi.objectId().required(),
hash: Joi.string().required().hex().length(40),
}),
query: Joi.object({
fallback: Joi.objectId().optional(),
}),
}),
RateLimiterMiddleware.rateLimit(rateLimiters.getProjectBlob),
AuthorizationMiddleware.ensureUserCanReadProject,
HistoryController.getBlob
)
webRouter.get(
'/Project/:Project_id/doc/:Doc_id/download', // "download" suffix to avoid conflict with private API route at doc/:doc_id
AuthorizationMiddleware.ensureUserCanReadProject,

View file

@ -127,8 +127,8 @@ describe('RestoringFiles', function () {
project.rootFolder[0].fileRefs,
file => file.name === 'image.png'
)
file = MockFilestoreApi.files[this.project_id][file._id]
expect(file.content).to.equal(this.pngData)
file = MockFilestoreApi.getFile(this.project_id, file._id)
expect(file).to.deep.equal(this.pngData)
done()
})
})

View file

@ -0,0 +1,132 @@
import fs from 'node:fs'
import Path from 'node:path'
import { expect } from 'chai'
import UserHelper from './helpers/User.js'
import MockV1HistoryApiClass from './mocks/MockV1HistoryApi.js'
import ProjectGetter from '../../../app/src/Features/Project/ProjectGetter.js'
import MockFilestoreApiClass from './mocks/MockFilestoreApi.js'
import { fileURLToPath } from 'node:url'
import Metrics from './helpers/metrics.js'
const User = UserHelper.promises

// Mock backends are created once for the whole suite (mocha root hook).
let MockV1HistoryApi, MockFilestoreApi

before(function () {
  MockV1HistoryApi = MockV1HistoryApiClass.instance()
  MockFilestoreApi = MockFilestoreApiClass.instance()
})

const __dirname = fileURLToPath(new URL('.', import.meta.url))
// NOTE(review): reading a PNG with 'utf-8' lossily decodes binary data;
// the assertions still hold because response bodies are decoded the same
// way, but reading a Buffer would be more faithful — consider changing.
const fileContent = fs.readFileSync(
  Path.join(__dirname, '../files/2pixel.png'),
  'utf-8'
)
// Acceptance tests for /project/:projectId/blob/:hash — the web route that
// serves blobs from history-v1 with an optional filestore fallback.
describe('HistoryTests', function () {
  let user, projectId, fileId, fileHash, fileURL, fileURLWithFallback
  let historySource, filestoreSource

  // Read the current value of the request_blob counter for a given
  // source label ('history-v1' or 'filestore') from the metrics endpoint.
  async function getSourceMetric(source) {
    return await Metrics.promises.getMetric(
      line => line.includes('request_blob') && line.includes(source)
    )
  }

  beforeEach('create project', async function () {
    user = new User()
    await user.login()
    projectId = await user.createProject('project1')
    const project = await ProjectGetter.promises.getProject(projectId)
    // Upload a file and capture both its filestore id and its history hash,
    // so tests can exercise the hash route and the ?fallback= query.
    ;({ entity_id: fileId, hash: fileHash } =
      await user.uploadFileInProjectFull(
        projectId,
        project.rootFolder[0]._id.toString(),
        '2pixel.png',
        '2pixel.png',
        'image/png'
      ))
    fileURL = `/project/${projectId}/blob/${fileHash}`
    fileURLWithFallback = `${fileURL}?fallback=${fileId}`
    // Snapshot the per-source counters so assertions check deltas, not
    // absolute values accumulated by earlier tests.
    historySource = await getSourceMetric('history-v1')
    filestoreSource = await getSourceMetric('filestore')
  })

  // Exactly one additional hit on history-v1, none on filestore.
  async function expectHistoryV1Hit() {
    expect(await getSourceMetric('history-v1')).to.equal(historySource + 1)
    expect(await getSourceMetric('filestore')).to.equal(filestoreSource)
  }
  // Exactly one additional hit on filestore, none on history-v1.
  async function expectFilestoreHit() {
    expect(await getSourceMetric('history-v1')).to.equal(historySource)
    expect(await getSourceMetric('filestore')).to.equal(filestoreSource + 1)
  }
  // Neither counter moved (e.g. the request 404ed).
  async function expectNoIncrement() {
    expect(await getSourceMetric('history-v1')).to.equal(historySource)
    expect(await getSourceMetric('filestore')).to.equal(filestoreSource)
  }

  describe('/project/:projectId/blob/:hash', function () {
    describe('HEAD', function () {
      it('should fetch the file size from history-v1', async function () {
        const { response } = await user.doRequest('HEAD', fileURL)
        expect(response.statusCode).to.equal(200)
        expect(response.headers['x-served-by']).to.include('history-v1')
        expect(response.headers['content-length']).to.equal('3694')
        await expectHistoryV1Hit()
      })

      it('should return 404 without fallback', async function () {
        // Wipe the mock's stored blobs so history-v1 serves a 404.
        MockV1HistoryApi.reset()
        const { response } = await user.doRequest('HEAD', fileURL)
        expect(response.statusCode).to.equal(404)
        await expectNoIncrement()
      })

      it('should fetch the file size from filestore when missing in history-v1', async function () {
        MockV1HistoryApi.reset()
        const { response } = await user.doRequest('HEAD', fileURLWithFallback)
        expect(response.statusCode).to.equal(200)
        expect(response.headers['x-served-by']).to.include('filestore')
        expect(response.headers['content-length']).to.equal('3694')
        await expectFilestoreHit()
      })

      it('should return 404 with both files missing', async function () {
        MockFilestoreApi.reset()
        MockV1HistoryApi.reset()
        const { response } = await user.doRequest('HEAD', fileURLWithFallback)
        expect(response.statusCode).to.equal(404)
        await expectNoIncrement()
      })
    })

    describe('GET', function () {
      it('should fetch the file from history-v1', async function () {
        const { response, body } = await user.doRequest('GET', fileURL)
        expect(response.statusCode).to.equal(200)
        expect(response.headers['x-served-by']).to.include('history-v1')
        expect(body).to.equal(fileContent)
        await expectHistoryV1Hit()
      })

      it('should return 404 without fallback', async function () {
        MockV1HistoryApi.reset()
        const { response } = await user.doRequest('GET', fileURL)
        expect(response.statusCode).to.equal(404)
        await expectNoIncrement()
      })

      it('should fetch the file size from filestore when missing in history-v1', async function () {
        MockV1HistoryApi.reset()
        const { response, body } = await user.doRequest(
          'GET',
          fileURLWithFallback
        )
        expect(response.statusCode).to.equal(200)
        expect(response.headers['x-served-by']).to.include('filestore')
        expect(body).to.equal(fileContent)
        await expectFilestoreHit()
      })

      it('should return 404 with both files missing', async function () {
        MockFilestoreApi.reset()
        MockV1HistoryApi.reset()
        const { response } = await user.doRequest('GET', fileURLWithFallback)
        expect(response.statusCode).to.equal(404)
        await expectNoIncrement()
      })
    })
  })
})

View file

@ -784,6 +784,24 @@ class User {
}
uploadFileInProject(projectId, folderId, file, name, contentType, callback) {
this.uploadFileInProjectFull(
projectId,
folderId,
file,
name,
contentType,
(err, body) => callback(err, body?.entity_id)
)
}
uploadFileInProjectFull(
projectId,
folderId,
file,
name,
contentType,
callback
) {
const fileStream = fs.createReadStream(
Path.resolve(Path.join(__dirname, '..', '..', 'files', file))
)
@ -819,7 +837,7 @@ class User {
)
}
callback(null, JSON.parse(body).entity_id)
callback(null, JSON.parse(body))
}
)
}

View file

@ -1,7 +1,4 @@
const AbstractMockApi = require('./AbstractMockApi')
const {
plainTextResponse,
} = require('../../../../app/src/infrastructure/Response')
class MockFilestoreApi extends AbstractMockApi {
reset() {
@ -14,27 +11,36 @@ class MockFilestoreApi extends AbstractMockApi {
req.on('data', chunk => chunks.push(chunk))
req.on('end', () => {
const content = Buffer.concat(chunks).toString()
const content = Buffer.concat(chunks)
const { projectId, fileId } = req.params
if (!this.files[projectId]) {
this.files[projectId] = {}
}
this.files[projectId][fileId] = { content }
this.files[projectId][fileId] = content
res.sendStatus(200)
})
})
this.app.head('/project/:projectId/file/:fileId', (req, res) => {
const { projectId, fileId } = req.params
const content = this.files[projectId]?.[fileId]
if (!content) return res.status(404).end()
res.set('Content-Length', content.byteLength)
res.status(200).end()
})
this.app.get('/project/:projectId/file/:fileId', (req, res) => {
const { projectId, fileId } = req.params
const { content } = this.files[projectId][fileId]
plainTextResponse(res, content)
const content = this.files[projectId]?.[fileId]
if (!content) return res.status(404).end()
res.status(200).end(content)
})
// handle file copying
this.app.put('/project/:projectId/file/:fileId', (req, res) => {
const { projectId, fileId } = req.params
const { source } = req.body
const { content } =
const content =
this.files[source.project_id] &&
this.files[source.project_id][source.file_id]
if (!content) {
@ -43,7 +49,7 @@ class MockFilestoreApi extends AbstractMockApi {
if (!this.files[projectId]) {
this.files[projectId] = {}
}
this.files[projectId][fileId] = { content }
this.files[projectId][fileId] = content
res.sendStatus(200)
}
})
@ -59,7 +65,7 @@ class MockFilestoreApi extends AbstractMockApi {
return (
this.files[projectId] &&
this.files[projectId][fileId] &&
this.files[projectId][fileId].content
this.files[projectId][fileId].toString()
)
}
}

View file

@ -11,6 +11,7 @@ class MockV1HistoryApi extends AbstractMockApi {
this.requestedZipPacks = 0
this.sentChunks = 0
this.events = new EventEmitter()
this.blobs = {}
}
applyRoutes() {
@ -79,8 +80,30 @@ class MockV1HistoryApi extends AbstractMockApi {
res.sendStatus(204)
})
this.app.put('/api/projects/:project_id/blobs/:hash', (req, res, next) => {
res.sendStatus(204)
this.app.put('/api/projects/:projectId/blobs/:hash', (req, res, next) => {
const chunks = []
req.on('data', chunk => chunks.push(chunk))
req.on('end', () => {
const { projectId, hash } = req.params
if (!this.blobs[projectId]) {
this.blobs[projectId] = {}
}
this.blobs[projectId][hash] = Buffer.concat(chunks)
res.sendStatus(200)
})
})
this.app.head('/api/projects/:projectId/blobs/:hash', (req, res, next) => {
const { projectId, hash } = req.params
const buf = this.blobs[projectId]?.[hash]
if (!buf) return res.status(404).end()
res.set('Content-Length', buf.byteLength)
res.status(200).end()
})
this.app.get('/api/projects/:projectId/blobs/:hash', (req, res, next) => {
const { projectId, hash } = req.params
const buf = this.blobs[projectId]?.[hash]
if (!buf) return res.status(404).end()
res.status(200).end(buf)
})
}
}

View file

@ -16,6 +16,7 @@ const Errors = require('../../../../app/src/Features/Errors/Errors')
const modulePath = '../../../../app/src/Features/History/HistoryController'
const SandboxedModule = require('sandboxed-module')
const { ObjectId } = require('mongodb-legacy')
describe('HistoryController', function () {
beforeEach(function () {
@ -31,6 +32,9 @@ describe('HistoryController', function () {
requires: {
request: (this.request = sinon.stub()),
'@overleaf/settings': (this.settings = {}),
'@overleaf/fetch-utils': {},
'@overleaf/Metrics': {},
'../../infrastructure/mongodb': { ObjectId },
stream: this.Stream,
'../Authentication/SessionManager': this.SessionManager,
'./HistoryManager': (this.HistoryManager = {}),

View file

@ -60,6 +60,7 @@ describe('HistoryManager', function () {
this.HistoryManager = SandboxedModule.require(MODULE_PATH, {
requires: {
'../../infrastructure/mongodb': { ObjectId },
'@overleaf/fetch-utils': this.FetchUtils,
'@overleaf/settings': this.settings,
'../User/UserGetter': this.UserGetter,