Merge pull request #17359 from overleaf/mj-download-filtering

[overleaf-editor-core+history-v1] Filter tracked deletes when creating zip files

GitOrigin-RevId: 1c19d3cb849150d33e750772399ea81c280fdd57
This commit is contained in:
Mathias Jakobsen 2024-03-12 12:14:02 +00:00 committed by Copybot
parent 4ef7bc617b
commit ce19b5f568
10 changed files with 247 additions and 87 deletions

View file

@ -14,6 +14,7 @@ const StringFileData = require('./file_data/string_file_data')
* @typedef {import("./types").StringFileRawData} StringFileRawData * @typedef {import("./types").StringFileRawData} StringFileRawData
* @typedef {import("./types").CommentRawData} CommentRawData * @typedef {import("./types").CommentRawData} CommentRawData
* @typedef {import("./operation/text_operation")} TextOperation * @typedef {import("./operation/text_operation")} TextOperation
* @typedef {{filterTrackedDeletes?: boolean}} FileGetContentOptions
*/ */
class NotEditableError extends OError { class NotEditableError extends OError {
@ -133,10 +134,11 @@ class File {
* The content of the file, if it is known and if this file has UTF-8 encoded * The content of the file, if it is known and if this file has UTF-8 encoded
* content. * content.
* *
* @param {FileGetContentOptions} [opts]
* @return {string | null | undefined} * @return {string | null | undefined}
*/ */
getContent() { getContent(opts = {}) {
return this.data.getContent() return this.data.getContent(opts)
} }
/** /**

View file

@ -90,6 +90,9 @@ class HashFileData extends FileData {
let rangesBlob let rangesBlob
if (this.rangesHash) { if (this.rangesHash) {
rangesBlob = await blobStore.getBlob(this.rangesHash) rangesBlob = await blobStore.getBlob(this.rangesHash)
if (!rangesBlob) {
throw new Error('Failed to look up rangesHash in blobStore')
}
} }
if (!blob) throw new Error('blob not found: ' + this.hash) if (!blob) throw new Error('blob not found: ' + this.hash)
return FileData.createLazyFromBlobs(blob, rangesBlob) return FileData.createLazyFromBlobs(blob, rangesBlob)
@ -102,6 +105,9 @@ class HashFileData extends FileData {
*/ */
async toHollow(blobStore) { async toHollow(blobStore) {
const blob = await blobStore.getBlob(this.hash) const blob = await blobStore.getBlob(this.hash)
if (!blob) {
throw new Error('Failed to look up hash in blobStore')
}
return FileData.createHollow(blob.getByteLength(), blob.getStringLength()) return FileData.createHollow(blob.getByteLength(), blob.getStringLength())
} }

View file

@ -90,9 +90,10 @@ class FileData {
/** /**
* @see File#getContent * @see File#getContent
* @param {import('../file').FileGetContentOptions} [opts]
* @return {string | null | undefined} * @return {string | null | undefined}
*/ */
getContent() { getContent(opts = {}) {
return null return null
} }

View file

@ -66,9 +66,29 @@ class StringFileData extends FileData {
return true return true
} }
/** @inheritdoc */ /**
getContent() { * @inheritdoc
return this.content * @param {import('../file').FileGetContentOptions} [opts]
*/
getContent(opts = {}) {
let content = ''
let cursor = 0
if (opts.filterTrackedDeletes) {
for (const tc of this.trackedChanges.trackedChanges) {
if (tc.tracking.type !== 'delete') {
continue
}
if (cursor < tc.range.start) {
content += this.content.slice(cursor, tc.range.start)
}
// skip the tracked change
cursor = tc.range.end
}
}
if (cursor < this.content.length) {
content += this.content.slice(cursor)
}
return content
} }
/** @inheritdoc */ /** @inheritdoc */

View file

@ -1,7 +1,7 @@
import Blob from './blob' import Blob from './blob'
export type BlobStore = { export type BlobStore = {
getBlob(hash: string): Promise<Blob> getBlob(hash: string): Promise<Blob | null>
getString(hash: string): Promise<string> getString(hash: string): Promise<string>
putString(content: string): Promise<Blob> putString(content: string): Promise<Blob>
putObject(obj: object): Promise<Blob> putObject(obj: object): Promise<Blob>

View file

@ -101,4 +101,68 @@ describe('StringFileData', function () {
{ id: 'comm2', ranges: [{ pos: 20, length: 5 }], resolved: true }, { id: 'comm2', ranges: [{ pos: 20, length: 5 }], resolved: true },
]) ])
}) })
it('getContent should filter out tracked deletions when passed option', function () {
const fileData = new StringFileData(
'the quick brown fox jumps over the lazy dog',
undefined,
[
{
range: { pos: 4, length: 6 },
tracking: {
type: 'delete',
ts: '2024-01-01T00:00:00.000Z',
userId: 'user1',
},
},
{
range: { pos: 35, length: 5 },
tracking: {
type: 'delete',
ts: '2023-01-01T00:00:00.000Z',
userId: 'user2',
},
},
]
)
expect(fileData.getContent()).to.equal(
'the quick brown fox jumps over the lazy dog'
)
expect(fileData.getContent({ filterTrackedDeletes: true })).to.equal(
'the brown fox jumps over the dog'
)
})
it('getContent should keep tracked insertions when passed option to remove tracked changes', function () {
const fileData = new StringFileData(
'the quick brown fox jumps over the lazy dog',
undefined,
[
{
range: { pos: 4, length: 6 },
tracking: {
type: 'insert',
ts: '2024-01-01T00:00:00.000Z',
userId: 'user1',
},
},
{
range: { pos: 35, length: 5 },
tracking: {
type: 'delete',
ts: '2023-01-01T00:00:00.000Z',
userId: 'user2',
},
},
]
)
expect(fileData.getContent()).to.equal(
'the quick brown fox jumps over the lazy dog'
)
expect(fileData.getContent({ filterTrackedDeletes: true })).to.equal(
'the quick brown fox jumps over the dog'
)
})
}) })

View file

@ -22,6 +22,8 @@ const postgresBackend = require('./postgres')
const mongoBackend = require('./mongo') const mongoBackend = require('./mongo')
const logger = require('@overleaf/logger') const logger = require('@overleaf/logger')
/** @typedef {import('stream').Readable} Readable */
const GLOBAL_BLOBS = new Map() const GLOBAL_BLOBS = new Map()
function makeGlobalKey(hash) { function makeGlobalKey(hash) {
@ -273,7 +275,7 @@ class BlobStore {
* failure, so the caller must be prepared to retry on errors, if appropriate. * failure, so the caller must be prepared to retry on errors, if appropriate.
* *
* @param {string} hash hexadecimal SHA-1 hash * @param {string} hash hexadecimal SHA-1 hash
* @return {stream} a stream to read the file * @return {Promise.<Readable>} a stream to read the file
*/ */
async getStream(hash) { async getStream(hash) {
assert.blobHash(hash, 'bad hash') assert.blobHash(hash, 'bad hash')

View file

@ -1,16 +1,24 @@
// @ts-check
'use strict' 'use strict'
/** @typedef {import('overleaf-editor-core/types').Snapshot} Snapshot */
const Archive = require('archiver') const Archive = require('archiver')
const BPromise = require('bluebird') const BPromise = require('bluebird')
const fs = require('fs') const fs = require('fs')
const { pipeline } = require('stream') const { pipeline } = require('stream')
const core = require('overleaf-editor-core') const core = require('overleaf-editor-core')
const Snapshot = core.Snapshot const Snapshot = core.Snapshot
const OError = require('@overleaf/o-error') const OError = require('@overleaf/o-error')
const assert = require('./assert') const assert = require('./assert')
/**
* @typedef {import('../../storage/lib/blob_store/index').BlobStore} BlobStore
*/
// The maximum safe concurrency appears to be 1. // The maximum safe concurrency appears to be 1.
// https://github.com/overleaf/issues/issues/1909 // https://github.com/overleaf/issues/issues/1909
const FETCH_CONCURRENCY = 1 // number of files to fetch at once const FETCH_CONCURRENCY = 1 // number of files to fetch at once
@ -21,101 +29,113 @@ class DownloadError extends OError {
super(`ProjectArchive: blob download failed: ${hash}`, { hash }) super(`ProjectArchive: blob download failed: ${hash}`, { hash })
} }
} }
ProjectArchive.DownloadError = DownloadError
class ArchiveTimeout extends OError { class ArchiveTimeout extends OError {
constructor() { constructor() {
super('ProjectArchive timed out') super('ProjectArchive timed out')
} }
} }
ProjectArchive.ArchiveTimeout = ArchiveTimeout
/** class MissingfileError extends OError {
* @constructor constructor() {
* @param {Snapshot} snapshot super('ProjectArchive: attempting to look up a file that does not exist')
* @param {?number} timeout in ms }
* @classdesc
* Writes the project snapshot to a zip file.
*/
function ProjectArchive(snapshot, timeout) {
assert.instance(snapshot, Snapshot)
this.snapshot = snapshot
this.timeout = timeout || DEFAULT_ZIP_TIMEOUT
} }
/** class ProjectArchive {
* Write zip archive to the given file path. static ArchiveTimeout = ArchiveTimeout
* static MissingfileError = MissingfileError
* @param {BlobStore} blobStore static DownloadError = DownloadError
* @param {string} zipFilePath
*/
ProjectArchive.prototype.writeZip = function projectArchiveToZip(
blobStore,
zipFilePath
) {
const snapshot = this.snapshot
const timeout = this.timeout
const startTime = process.hrtime() /**
const archive = new Archive('zip') * @constructor
* @param {Snapshot} snapshot
// Convert elapsed seconds and nanoseconds to milliseconds. * @param {?number} timeout in ms
function findElapsedMilliseconds() { * @classdesc
const elapsed = process.hrtime(startTime) * Writes the project snapshot to a zip file.
return elapsed[0] * 1e3 + elapsed[1] * 1e-6 */
constructor(snapshot, timeout) {
assert.instance(snapshot, Snapshot)
this.snapshot = snapshot
this.timeout = timeout || DEFAULT_ZIP_TIMEOUT
} }
function addFileToArchive(pathname) { /**
if (findElapsedMilliseconds() > timeout) { * Write zip archive to the given file path.
throw new ProjectArchive.ArchiveTimeout() *
* @param {BlobStore} blobStore
* @param {string} zipFilePath
*/
writeZip(blobStore, zipFilePath) {
const snapshot = this.snapshot
const timeout = this.timeout
const startTime = process.hrtime()
const archive = new Archive('zip')
// Convert elapsed seconds and nanoseconds to milliseconds.
function findElapsedMilliseconds() {
const elapsed = process.hrtime(startTime)
return elapsed[0] * 1e3 + elapsed[1] * 1e-6
} }
const file = snapshot.getFile(pathname) function addFileToArchive(pathname) {
return file.load('eager', blobStore).then(function () { if (findElapsedMilliseconds() > timeout) {
const content = file.getContent() throw new ProjectArchive.ArchiveTimeout()
if (content === null) {
return streamFileToArchive(pathname, file).catch(function (err) {
throw new ProjectArchive.DownloadError(file.getHash()).withCause(err)
})
} else {
archive.append(content, { name: pathname })
} }
})
}
function streamFileToArchive(pathname, file) { const file = snapshot.getFile(pathname)
return new BPromise(function (resolve, reject) { if (!file) {
blobStore throw new ProjectArchive.MissingfileError()
.getStream(file.getHash())
.then(stream => {
stream.on('error', reject)
stream.on('end', resolve)
archive.append(stream, { name: pathname })
})
.catch(reject)
})
}
const addFilesToArchiveAndFinalize = BPromise.map(
snapshot.getFilePathnames(),
addFileToArchive,
{ concurrency: FETCH_CONCURRENCY }
).then(function () {
archive.finalize()
})
const streamArchiveToFile = new BPromise(function (resolve, reject) {
const stream = fs.createWriteStream(zipFilePath)
pipeline(archive, stream, function (err) {
if (err) {
reject(err)
} else {
resolve()
} }
}) return file.load('eager', blobStore).then(function () {
}) const content = file.getContent({ filterTrackedDeletes: true })
if (content === null) {
return streamFileToArchive(pathname, file).catch(function (err) {
throw new ProjectArchive.DownloadError(file.getHash()).withCause(
err
)
})
} else {
archive.append(content, { name: pathname })
}
})
}
return BPromise.join(streamArchiveToFile, addFilesToArchiveAndFinalize) function streamFileToArchive(pathname, file) {
return new BPromise(function (resolve, reject) {
blobStore
.getStream(file.getHash())
.then(stream => {
stream.on('error', reject)
stream.on('end', resolve)
archive.append(stream, { name: pathname })
})
.catch(reject)
})
}
const addFilesToArchiveAndFinalize = BPromise.map(
snapshot.getFilePathnames(),
addFileToArchive,
{ concurrency: FETCH_CONCURRENCY }
).then(function () {
archive.finalize()
})
const streamArchiveToFile = new BPromise(function (resolve, reject) {
const stream = fs.createWriteStream(zipFilePath)
pipeline(archive, stream, function (err) {
if (err) {
reject(err)
} else {
resolve()
}
})
})
return BPromise.join(streamArchiveToFile, addFilesToArchiveAndFinalize)
}
} }
module.exports = ProjectArchive module.exports = ProjectArchive

View file

@ -34,7 +34,7 @@ class ZipStore {
/** /**
* Generate signed link to access the zip file. * Generate signed link to access the zip file.
* *
* @param {number} projectId * @param {number | string} projectId
* @param {number} version * @param {number} version
* @return {string} * @return {string}
*/ */
@ -49,7 +49,7 @@ class ZipStore {
/** /**
* Generate a zip of the given snapshot. * Generate a zip of the given snapshot.
* *
* @param {number} projectId * @param {number | string} projectId
* @param {number} version * @param {number} version
* @param {Snapshot} snapshot * @param {Snapshot} snapshot
*/ */

View file

@ -53,4 +53,49 @@ describe('zipStore', function () {
expect(entries.length).to.equal(1) expect(entries.length).to.equal(1)
expect(entries[0].fileName).to.equal('hello.txt') expect(entries[0].fileName).to.equal('hello.txt')
}) })
it('filters out tracked deletes', async function () {
const projectId = fixtures.docs.uninitializedProject.id
const version = 1
const testSnapshot = new Snapshot()
testSnapshot.addFile(
'test.tex',
File.fromRaw({
content: 'the quick brown fox jumps over the lazy dog',
trackedChanges: [
{
range: { pos: 4, length: 6 },
tracking: {
type: 'delete',
ts: '2024-01-01T00:00:00.000Z',
userId: 'user1',
},
},
{
range: { pos: 35, length: 5 },
tracking: {
type: 'delete',
ts: '2023-01-01T00:00:00.000Z',
userId: 'user2',
},
},
],
})
)
const zipUrl = await zipStore.getSignedUrl(projectId, version)
// Build the zip file.
await zipStore.storeZip(projectId, version, testSnapshot)
// Now we should be able to fetch it.
const postZipResponse = await fetch(zipUrl)
expect(postZipResponse.status).to.equal(200)
const zipBuffer = await postZipResponse.buffer()
await fs.writeFileAsync(zipFilePath, zipBuffer)
const entries = await getZipEntries(zipFilePath)
expect(entries.length).to.equal(1)
expect(entries[0].fileName).to.equal('test.tex')
expect(entries[0].uncompressedSize).to.equal(
'the brown fox jumps over the dog'.length
)
})
}) })