Merge pull request #17359 from overleaf/mj-download-filtering

[overleaf-editor-core+history-v1] Filter tracked deletes when creating zip files

GitOrigin-RevId: 1c19d3cb849150d33e750772399ea81c280fdd57
This commit is contained in:
Mathias Jakobsen 2024-03-12 12:14:02 +00:00 committed by Copybot
parent 4ef7bc617b
commit ce19b5f568
10 changed files with 247 additions and 87 deletions

View file

@ -14,6 +14,7 @@ const StringFileData = require('./file_data/string_file_data')
* @typedef {import("./types").StringFileRawData} StringFileRawData
* @typedef {import("./types").CommentRawData} CommentRawData
* @typedef {import("./operation/text_operation")} TextOperation
* @typedef {{filterTrackedDeletes?: boolean}} FileGetContentOptions
*/
class NotEditableError extends OError {
@ -133,10 +134,11 @@ class File {
* The content of the file, if it is known and if this file has UTF-8 encoded
* content.
*
* @param {FileGetContentOptions} [opts]
* @return {string | null | undefined}
*/
getContent() {
return this.data.getContent()
getContent(opts = {}) {
return this.data.getContent(opts)
}
/**

View file

@ -90,6 +90,9 @@ class HashFileData extends FileData {
let rangesBlob
if (this.rangesHash) {
rangesBlob = await blobStore.getBlob(this.rangesHash)
if (!rangesBlob) {
throw new Error('Failed to look up rangesHash in blobStore')
}
}
if (!blob) throw new Error('blob not found: ' + this.hash)
return FileData.createLazyFromBlobs(blob, rangesBlob)
@ -102,6 +105,9 @@ class HashFileData extends FileData {
*/
async toHollow(blobStore) {
const blob = await blobStore.getBlob(this.hash)
if (!blob) {
throw new Error('Failed to look up hash in blobStore')
}
return FileData.createHollow(blob.getByteLength(), blob.getStringLength())
}

View file

@ -90,9 +90,10 @@ class FileData {
/**
* @see File#getContent
* @param {import('../file').FileGetContentOptions} [opts]
* @return {string | null | undefined}
*/
getContent() {
getContent(opts = {}) {
return null
}

View file

@ -66,9 +66,29 @@ class StringFileData extends FileData {
return true
}
/** @inheritdoc */
getContent() {
return this.content
/**
* @inheritdoc
* @param {import('../file').FileGetContentOptions} [opts]
*/
getContent(opts = {}) {
let content = ''
let cursor = 0
if (opts.filterTrackedDeletes) {
for (const tc of this.trackedChanges.trackedChanges) {
if (tc.tracking.type !== 'delete') {
continue
}
if (cursor < tc.range.start) {
content += this.content.slice(cursor, tc.range.start)
}
// skip the tracked change
cursor = tc.range.end
}
}
if (cursor < this.content.length) {
content += this.content.slice(cursor)
}
return content
}
/** @inheritdoc */

View file

@ -1,7 +1,7 @@
import Blob from './blob'
export type BlobStore = {
getBlob(hash: string): Promise<Blob>
getBlob(hash: string): Promise<Blob | null>
getString(hash: string): Promise<string>
putString(content: string): Promise<Blob>
putObject(obj: object): Promise<Blob>

View file

@ -101,4 +101,68 @@ describe('StringFileData', function () {
{ id: 'comm2', ranges: [{ pos: 20, length: 5 }], resolved: true },
])
})
// With no options, getContent returns the raw stored string; with
// { filterTrackedDeletes: true } the two tracked-delete ranges
// ("quick " at [4,10) and "lazy " at [35,40)) are removed from the output.
it('getContent should filter out tracked deletions when passed option', function () {
const fileData = new StringFileData(
'the quick brown fox jumps over the lazy dog',
undefined,
[
{
range: { pos: 4, length: 6 },
tracking: {
type: 'delete',
ts: '2024-01-01T00:00:00.000Z',
userId: 'user1',
},
},
{
range: { pos: 35, length: 5 },
tracking: {
type: 'delete',
ts: '2023-01-01T00:00:00.000Z',
userId: 'user2',
},
},
]
)
// Default path: content is untouched.
expect(fileData.getContent()).to.equal(
'the quick brown fox jumps over the lazy dog'
)
// Filtered path: both delete ranges are dropped.
expect(fileData.getContent({ filterTrackedDeletes: true })).to.equal(
'the brown fox jumps over the dog'
)
})
// Only tracked changes of type 'delete' are filtered: the tracked insert at
// [4,10) must survive, while the tracked delete at [35,40) is removed.
it('getContent should keep tracked insertions when passed option to remove tracked changes', function () {
const fileData = new StringFileData(
'the quick brown fox jumps over the lazy dog',
undefined,
[
{
range: { pos: 4, length: 6 },
tracking: {
type: 'insert',
ts: '2024-01-01T00:00:00.000Z',
userId: 'user1',
},
},
{
range: { pos: 35, length: 5 },
tracking: {
type: 'delete',
ts: '2023-01-01T00:00:00.000Z',
userId: 'user2',
},
},
]
)
// Default path: content is untouched.
expect(fileData.getContent()).to.equal(
'the quick brown fox jumps over the lazy dog'
)
// Filtered path: "quick " (insert) stays, "lazy " (delete) goes.
expect(fileData.getContent({ filterTrackedDeletes: true })).to.equal(
'the quick brown fox jumps over the dog'
)
})
})

View file

@ -22,6 +22,8 @@ const postgresBackend = require('./postgres')
const mongoBackend = require('./mongo')
const logger = require('@overleaf/logger')
/** @typedef {import('stream').Readable} Readable */
const GLOBAL_BLOBS = new Map()
function makeGlobalKey(hash) {
@ -273,7 +275,7 @@ class BlobStore {
* failure, so the caller must be prepared to retry on errors, if appropriate.
*
* @param {string} hash hexadecimal SHA-1 hash
* @return {stream} a stream to read the file
* @return {Promise.<Readable>} a stream to read the file
*/
async getStream(hash) {
assert.blobHash(hash, 'bad hash')

View file

@ -1,16 +1,24 @@
// @ts-check
'use strict'
/** @typedef {import('overleaf-editor-core/types').Snapshot} Snapshot */
const Archive = require('archiver')
const BPromise = require('bluebird')
const fs = require('fs')
const { pipeline } = require('stream')
const core = require('overleaf-editor-core')
const Snapshot = core.Snapshot
const OError = require('@overleaf/o-error')
const assert = require('./assert')
/**
* @typedef {import('../../storage/lib/blob_store/index').BlobStore} BlobStore
*/
// The maximum safe concurrency appears to be 1.
// https://github.com/overleaf/issues/issues/1909
const FETCH_CONCURRENCY = 1 // number of files to fetch at once
@ -21,101 +29,113 @@ class DownloadError extends OError {
super(`ProjectArchive: blob download failed: ${hash}`, { hash })
}
}
ProjectArchive.DownloadError = DownloadError
// Thrown when building the zip archive exceeds the configured timeout
// (see ProjectArchive's timeout handling in writeZip).
class ArchiveTimeout extends OError {
constructor() {
super('ProjectArchive timed out')
}
}
ProjectArchive.ArchiveTimeout = ArchiveTimeout
/**
* @constructor
* @param {Snapshot} snapshot
* @param {?number} timeout in ms
* @classdesc
* Writes the project snapshot to a zip file.
*/
function ProjectArchive(snapshot, timeout) {
assert.instance(snapshot, Snapshot)
this.snapshot = snapshot
this.timeout = timeout || DEFAULT_ZIP_TIMEOUT
// Thrown when a pathname listed in the snapshot has no corresponding file
// object (snapshot.getFile returned nothing).
// NOTE(review): conventional casing would be "MissingFileError"; the name is
// kept as-is because it is exposed as ProjectArchive.MissingfileError and
// renaming would break callers.
class MissingfileError extends OError {
constructor() {
super('ProjectArchive: attempting to look up a file that does not exist')
}
}
/**
* Write zip archive to the given file path.
*
* @param {BlobStore} blobStore
* @param {string} zipFilePath
*/
ProjectArchive.prototype.writeZip = function projectArchiveToZip(
blobStore,
zipFilePath
) {
const snapshot = this.snapshot
const timeout = this.timeout
class ProjectArchive {
static ArchiveTimeout = ArchiveTimeout
static MissingfileError = MissingfileError
static DownloadError = DownloadError
const startTime = process.hrtime()
const archive = new Archive('zip')
// Convert elapsed seconds and nanoseconds to milliseconds.
function findElapsedMilliseconds() {
const elapsed = process.hrtime(startTime)
return elapsed[0] * 1e3 + elapsed[1] * 1e-6
/**
 * Writes the project snapshot to a zip file.
 *
 * @param {Snapshot} snapshot the snapshot to archive
 * @param {?number} timeout maximum time to spend building the zip, in ms;
 *   any falsy value (including 0) falls back to DEFAULT_ZIP_TIMEOUT —
 *   NOTE(review): `||` means a caller cannot request a zero timeout; confirm
 *   that is intended before changing to `??`.
 */
constructor(snapshot, timeout) {
assert.instance(snapshot, Snapshot)
this.snapshot = snapshot
this.timeout = timeout || DEFAULT_ZIP_TIMEOUT
}
function addFileToArchive(pathname) {
if (findElapsedMilliseconds() > timeout) {
throw new ProjectArchive.ArchiveTimeout()
/**
* Write zip archive to the given file path.
*
* @param {BlobStore} blobStore
* @param {string} zipFilePath
*/
writeZip(blobStore, zipFilePath) {
const snapshot = this.snapshot
const timeout = this.timeout
const startTime = process.hrtime()
const archive = new Archive('zip')
// Convert elapsed seconds and nanoseconds to milliseconds.
function findElapsedMilliseconds() {
const elapsed = process.hrtime(startTime)
return elapsed[0] * 1e3 + elapsed[1] * 1e-6
}
const file = snapshot.getFile(pathname)
return file.load('eager', blobStore).then(function () {
const content = file.getContent()
if (content === null) {
return streamFileToArchive(pathname, file).catch(function (err) {
throw new ProjectArchive.DownloadError(file.getHash()).withCause(err)
})
} else {
archive.append(content, { name: pathname })
function addFileToArchive(pathname) {
if (findElapsedMilliseconds() > timeout) {
throw new ProjectArchive.ArchiveTimeout()
}
})
}
function streamFileToArchive(pathname, file) {
return new BPromise(function (resolve, reject) {
blobStore
.getStream(file.getHash())
.then(stream => {
stream.on('error', reject)
stream.on('end', resolve)
archive.append(stream, { name: pathname })
})
.catch(reject)
})
}
const addFilesToArchiveAndFinalize = BPromise.map(
snapshot.getFilePathnames(),
addFileToArchive,
{ concurrency: FETCH_CONCURRENCY }
).then(function () {
archive.finalize()
})
const streamArchiveToFile = new BPromise(function (resolve, reject) {
const stream = fs.createWriteStream(zipFilePath)
pipeline(archive, stream, function (err) {
if (err) {
reject(err)
} else {
resolve()
const file = snapshot.getFile(pathname)
if (!file) {
throw new ProjectArchive.MissingfileError()
}
})
})
return file.load('eager', blobStore).then(function () {
const content = file.getContent({ filterTrackedDeletes: true })
if (content === null) {
return streamFileToArchive(pathname, file).catch(function (err) {
throw new ProjectArchive.DownloadError(file.getHash()).withCause(
err
)
})
} else {
archive.append(content, { name: pathname })
}
})
}
return BPromise.join(streamArchiveToFile, addFilesToArchiveAndFinalize)
function streamFileToArchive(pathname, file) {
return new BPromise(function (resolve, reject) {
blobStore
.getStream(file.getHash())
.then(stream => {
stream.on('error', reject)
stream.on('end', resolve)
archive.append(stream, { name: pathname })
})
.catch(reject)
})
}
const addFilesToArchiveAndFinalize = BPromise.map(
snapshot.getFilePathnames(),
addFileToArchive,
{ concurrency: FETCH_CONCURRENCY }
).then(function () {
archive.finalize()
})
const streamArchiveToFile = new BPromise(function (resolve, reject) {
const stream = fs.createWriteStream(zipFilePath)
pipeline(archive, stream, function (err) {
if (err) {
reject(err)
} else {
resolve()
}
})
})
return BPromise.join(streamArchiveToFile, addFilesToArchiveAndFinalize)
}
}
module.exports = ProjectArchive

View file

@ -34,7 +34,7 @@ class ZipStore {
/**
* Generate signed link to access the zip file.
*
* @param {number} projectId
* @param {number | string} projectId
* @param {number} version
* @return {string}
*/
@ -49,7 +49,7 @@ class ZipStore {
/**
* Generate a zip of the given snapshot.
*
* @param {number} projectId
* @param {number | string} projectId
* @param {number} version
* @param {Snapshot} snapshot
*/

View file

@ -53,4 +53,49 @@ describe('zipStore', function () {
expect(entries.length).to.equal(1)
expect(entries[0].fileName).to.equal('hello.txt')
})
// End-to-end check that storeZip passes filterTrackedDeletes through to
// File#getContent: the stored zip entry must contain the content with both
// tracked-delete ranges removed, verified via its uncompressed size.
it('filters out tracked deletes', async function () {
const projectId = fixtures.docs.uninitializedProject.id
const version = 1
const testSnapshot = new Snapshot()
testSnapshot.addFile(
'test.tex',
File.fromRaw({
content: 'the quick brown fox jumps over the lazy dog',
trackedChanges: [
{
range: { pos: 4, length: 6 },
tracking: {
type: 'delete',
ts: '2024-01-01T00:00:00.000Z',
userId: 'user1',
},
},
{
range: { pos: 35, length: 5 },
tracking: {
type: 'delete',
ts: '2023-01-01T00:00:00.000Z',
userId: 'user2',
},
},
],
})
)
const zipUrl = await zipStore.getSignedUrl(projectId, version)
// Build the zip file.
await zipStore.storeZip(projectId, version, testSnapshot)
// Now we should be able to fetch it.
const postZipResponse = await fetch(zipUrl)
expect(postZipResponse.status).to.equal(200)
const zipBuffer = await postZipResponse.buffer()
await fs.writeFileAsync(zipFilePath, zipBuffer)
const entries = await getZipEntries(zipFilePath)
expect(entries.length).to.equal(1)
expect(entries[0].fileName).to.equal('test.tex')
// Size of the filtered string, not the original 44-char content.
expect(entries[0].uncompressedSize).to.equal(
'the brown fox jumps over the dog'.length
)
})
})