From ce19b5f5680a0798578ad8eb848a0d49f214e9bf Mon Sep 17 00:00:00 2001 From: Mathias Jakobsen Date: Tue, 12 Mar 2024 12:14:02 +0000 Subject: [PATCH] Merge pull request #17359 from overleaf/mj-download-filtering [overleaf-editor-core+history-v1] Filter tracked deletes when creating zip files GitOrigin-RevId: 1c19d3cb849150d33e750772399ea81c280fdd57 --- libraries/overleaf-editor-core/lib/file.js | 6 +- .../lib/file_data/hash_file_data.js | 6 + .../lib/file_data/index.js | 3 +- .../lib/file_data/string_file_data.js | 26 ++- libraries/overleaf-editor-core/lib/types.ts | 2 +- .../test/string_file_data.test.js | 64 +++++++ .../storage/lib/blob_store/index.js | 4 +- .../history-v1/storage/lib/project_archive.js | 174 ++++++++++-------- services/history-v1/storage/lib/zip_store.js | 4 +- .../acceptance/js/storage/zip_store.test.js | 45 +++++ 10 files changed, 247 insertions(+), 87 deletions(-) diff --git a/libraries/overleaf-editor-core/lib/file.js b/libraries/overleaf-editor-core/lib/file.js index 7ad4d97663..db4941bb8a 100644 --- a/libraries/overleaf-editor-core/lib/file.js +++ b/libraries/overleaf-editor-core/lib/file.js @@ -14,6 +14,7 @@ const StringFileData = require('./file_data/string_file_data') * @typedef {import("./types").StringFileRawData} StringFileRawData * @typedef {import("./types").CommentRawData} CommentRawData * @typedef {import("./operation/text_operation")} TextOperation + * @typedef {{filterTrackedDeletes?: boolean}} FileGetContentOptions */ class NotEditableError extends OError { @@ -133,10 +134,11 @@ class File { * The content of the file, if it is known and if this file has UTF-8 encoded * content. * + * @param {FileGetContentOptions} [opts] * @return {string | null | undefined} */ - getContent() { - return this.data.getContent() + getContent(opts = {}) { + return this.data.getContent(opts) } /** diff --git a/libraries/overleaf-editor-core/lib/file_data/hash_file_data.js b/libraries/overleaf-editor-core/lib/file_data/hash_file_data.js index 0105e7cc19..eccb9de0ce 100644 --- a/libraries/overleaf-editor-core/lib/file_data/hash_file_data.js +++ b/libraries/overleaf-editor-core/lib/file_data/hash_file_data.js @@ -90,6 +90,9 @@ class HashFileData extends FileData { let rangesBlob if (this.rangesHash) { rangesBlob = await blobStore.getBlob(this.rangesHash) + if (!rangesBlob) { + throw new Error('Failed to look up rangesHash in blobStore') + } } if (!blob) throw new Error('blob not found: ' + this.hash) return FileData.createLazyFromBlobs(blob, rangesBlob) @@ -102,6 +105,9 @@ class HashFileData extends FileData { */ async toHollow(blobStore) { const blob = await blobStore.getBlob(this.hash) + if (!blob) { + throw new Error('Failed to look up hash in blobStore') + } return FileData.createHollow(blob.getByteLength(), blob.getStringLength()) } diff --git a/libraries/overleaf-editor-core/lib/file_data/index.js b/libraries/overleaf-editor-core/lib/file_data/index.js index fc1adca3e9..30f7f2e01c 100644 --- a/libraries/overleaf-editor-core/lib/file_data/index.js +++ b/libraries/overleaf-editor-core/lib/file_data/index.js @@ -90,9 +90,10 @@ class FileData { /** * @see File#getContent + * @param {import('../file').FileGetContentOptions} [opts] * @return {string | null | undefined} */ - getContent() { + getContent(opts = {}) { return null } diff --git a/libraries/overleaf-editor-core/lib/file_data/string_file_data.js b/libraries/overleaf-editor-core/lib/file_data/string_file_data.js index d9b69885fb..698012bd82 100644 --- a/libraries/overleaf-editor-core/lib/file_data/string_file_data.js +++ b/libraries/overleaf-editor-core/lib/file_data/string_file_data.js @@ -66,9 +66,29 @@ class StringFileData extends FileData { return true } - /** @inheritdoc */ - getContent() { - return this.content + /** + * @inheritdoc + * @param {import('../file').FileGetContentOptions} [opts] + */ + getContent(opts = {}) { + let content = '' + let cursor = 0 + if (opts.filterTrackedDeletes) { + for (const tc of this.trackedChanges.trackedChanges) { + if (tc.tracking.type !== 'delete') { + continue + } + if (cursor < tc.range.start) { + content += this.content.slice(cursor, tc.range.start) + } + // skip the tracked change + cursor = tc.range.end + } + } + if (cursor < this.content.length) { + content += this.content.slice(cursor) + } + return content } /** @inheritdoc */ diff --git a/libraries/overleaf-editor-core/lib/types.ts b/libraries/overleaf-editor-core/lib/types.ts index a537828b75..3227d3baf9 100644 --- a/libraries/overleaf-editor-core/lib/types.ts +++ b/libraries/overleaf-editor-core/lib/types.ts @@ -1,7 +1,7 @@ import Blob from './blob' export type BlobStore = { - getBlob(hash: string): Promise + getBlob(hash: string): Promise getString(hash: string): Promise putString(content: string): Promise putObject(obj: object): Promise diff --git a/libraries/overleaf-editor-core/test/string_file_data.test.js b/libraries/overleaf-editor-core/test/string_file_data.test.js index 2f3e13e2cb..9c898c66dd 100644 --- a/libraries/overleaf-editor-core/test/string_file_data.test.js +++ b/libraries/overleaf-editor-core/test/string_file_data.test.js @@ -101,4 +101,68 @@ describe('StringFileData', function () { { id: 'comm2', ranges: [{ pos: 20, length: 5 }], resolved: true }, ]) }) + + it('getContent should filter out tracked deletions when passed option', function () { + const fileData = new StringFileData( + 'the quick brown fox jumps over the lazy dog', + undefined, + [ + { + range: { pos: 4, length: 6 }, + tracking: { + type: 'delete', + ts: '2024-01-01T00:00:00.000Z', + userId: 'user1', + }, + }, + { + range: { pos: 35, length: 5 }, + tracking: { + type: 'delete', + ts: '2023-01-01T00:00:00.000Z', + userId: 'user2', + }, + }, + ] + ) + + expect(fileData.getContent()).to.equal( + 'the quick brown fox jumps over the lazy dog' + ) + expect(fileData.getContent({ filterTrackedDeletes: true })).to.equal( + 'the brown fox jumps over the dog' + ) + }) + + it('getContent should keep tracked insertions when passed option to remove tracked changes', function () { + const fileData = new StringFileData( + 'the quick brown fox jumps over the lazy dog', + undefined, + [ + { + range: { pos: 4, length: 6 }, + tracking: { + type: 'insert', + ts: '2024-01-01T00:00:00.000Z', + userId: 'user1', + }, + }, + { + range: { pos: 35, length: 5 }, + tracking: { + type: 'delete', + ts: '2023-01-01T00:00:00.000Z', + userId: 'user2', + }, + }, + ] + ) + + expect(fileData.getContent()).to.equal( + 'the quick brown fox jumps over the lazy dog' + ) + expect(fileData.getContent({ filterTrackedDeletes: true })).to.equal( + 'the quick brown fox jumps over the dog' + ) + }) }) diff --git a/services/history-v1/storage/lib/blob_store/index.js b/services/history-v1/storage/lib/blob_store/index.js index 7935810292..81660797db 100644 --- a/services/history-v1/storage/lib/blob_store/index.js +++ b/services/history-v1/storage/lib/blob_store/index.js @@ -22,6 +22,8 @@ const postgresBackend = require('./postgres') const mongoBackend = require('./mongo') const logger = require('@overleaf/logger') +/** @typedef {import('stream').Readable} Readable */ + const GLOBAL_BLOBS = new Map() function makeGlobalKey(hash) { @@ -273,7 +275,7 @@ class BlobStore { * failure, so the caller must be prepared to retry on errors, if appropriate. * * @param {string} hash hexadecimal SHA-1 hash - * @return {stream} a stream to read the file + * @return {Promise.} a stream to read the file */ async getStream(hash) { assert.blobHash(hash, 'bad hash') diff --git a/services/history-v1/storage/lib/project_archive.js b/services/history-v1/storage/lib/project_archive.js index 6e0cd2b083..31d3b62444 100644 --- a/services/history-v1/storage/lib/project_archive.js +++ b/services/history-v1/storage/lib/project_archive.js @@ -1,16 +1,24 @@ +// @ts-check 'use strict' +/** @typedef {import('overleaf-editor-core/types').Snapshot} Snapshot */ + const Archive = require('archiver') const BPromise = require('bluebird') const fs = require('fs') const { pipeline } = require('stream') const core = require('overleaf-editor-core') + const Snapshot = core.Snapshot const OError = require('@overleaf/o-error') const assert = require('./assert') +/** + * @typedef {import('../../storage/lib/blob_store/index').BlobStore} BlobStore + */ + // The maximum safe concurrency appears to be 1. // https://github.com/overleaf/issues/issues/1909 const FETCH_CONCURRENCY = 1 // number of files to fetch at once @@ -21,101 +29,113 @@ class DownloadError extends OError { super(`ProjectArchive: blob download failed: ${hash}`, { hash }) } } -ProjectArchive.DownloadError = DownloadError class ArchiveTimeout extends OError { constructor() { super('ProjectArchive timed out') } } -ProjectArchive.ArchiveTimeout = ArchiveTimeout -/** - * @constructor - * @param {Snapshot} snapshot - * @param {?number} timeout in ms - * @classdesc - * Writes the project snapshot to a zip file. - */ -function ProjectArchive(snapshot, timeout) { - assert.instance(snapshot, Snapshot) - this.snapshot = snapshot - this.timeout = timeout || DEFAULT_ZIP_TIMEOUT +class MissingfileError extends OError { + constructor() { + super('ProjectArchive: attempting to look up a file that does not exist') + } } -/** - * Write zip archive to the given file path. - * - * @param {BlobStore} blobStore - * @param {string} zipFilePath - */ -ProjectArchive.prototype.writeZip = function projectArchiveToZip( - blobStore, - zipFilePath -) { - const snapshot = this.snapshot - const timeout = this.timeout +class ProjectArchive { + static ArchiveTimeout = ArchiveTimeout + static MissingfileError = MissingfileError + static DownloadError = DownloadError - const startTime = process.hrtime() - const archive = new Archive('zip') - - // Convert elapsed seconds and nanoseconds to milliseconds. - function findElapsedMilliseconds() { - const elapsed = process.hrtime(startTime) - return elapsed[0] * 1e3 + elapsed[1] * 1e-6 + /** + * @constructor + * @param {Snapshot} snapshot + * @param {?number} timeout in ms + * @classdesc + * Writes the project snapshot to a zip file. + */ + constructor(snapshot, timeout) { + assert.instance(snapshot, Snapshot) + this.snapshot = snapshot + this.timeout = timeout || DEFAULT_ZIP_TIMEOUT } - function addFileToArchive(pathname) { - if (findElapsedMilliseconds() > timeout) { - throw new ProjectArchive.ArchiveTimeout() + /** + * Write zip archive to the given file path. + * + * @param {BlobStore} blobStore + * @param {string} zipFilePath + */ + writeZip(blobStore, zipFilePath) { + const snapshot = this.snapshot + const timeout = this.timeout + + const startTime = process.hrtime() + const archive = new Archive('zip') + + // Convert elapsed seconds and nanoseconds to milliseconds. + function findElapsedMilliseconds() { + const elapsed = process.hrtime(startTime) + return elapsed[0] * 1e3 + elapsed[1] * 1e-6 } - const file = snapshot.getFile(pathname) - return file.load('eager', blobStore).then(function () { - const content = file.getContent() - if (content === null) { - return streamFileToArchive(pathname, file).catch(function (err) { - throw new ProjectArchive.DownloadError(file.getHash()).withCause(err) - }) - } else { - archive.append(content, { name: pathname }) + function addFileToArchive(pathname) { + if (findElapsedMilliseconds() > timeout) { + throw new ProjectArchive.ArchiveTimeout() } - }) - } - function streamFileToArchive(pathname, file) { - return new BPromise(function (resolve, reject) { - blobStore - .getStream(file.getHash()) - .then(stream => { - stream.on('error', reject) - stream.on('end', resolve) - archive.append(stream, { name: pathname }) - }) - .catch(reject) - }) - } - - const addFilesToArchiveAndFinalize = BPromise.map( - snapshot.getFilePathnames(), - addFileToArchive, - { concurrency: FETCH_CONCURRENCY } - ).then(function () { - archive.finalize() - }) - - const streamArchiveToFile = new BPromise(function (resolve, reject) { - const stream = fs.createWriteStream(zipFilePath) - pipeline(archive, stream, function (err) { - if (err) { - reject(err) - } else { - resolve() + const file = snapshot.getFile(pathname) + if (!file) { + throw new ProjectArchive.MissingfileError() } - }) - }) + return file.load('eager', blobStore).then(function () { + const content = file.getContent({ filterTrackedDeletes: true }) + if (content === null) { + return streamFileToArchive(pathname, file).catch(function (err) { + throw new ProjectArchive.DownloadError(file.getHash()).withCause( + err + ) + }) + } else { + archive.append(content, { name: pathname }) + } + }) + } - return BPromise.join(streamArchiveToFile, addFilesToArchiveAndFinalize) + function streamFileToArchive(pathname, file) { + return new BPromise(function (resolve, reject) { + blobStore + .getStream(file.getHash()) + .then(stream => { + stream.on('error', reject) + stream.on('end', resolve) + archive.append(stream, { name: pathname }) + }) + .catch(reject) + }) + } + + const addFilesToArchiveAndFinalize = BPromise.map( + snapshot.getFilePathnames(), + addFileToArchive, + { concurrency: FETCH_CONCURRENCY } + ).then(function () { + archive.finalize() + }) + + const streamArchiveToFile = new BPromise(function (resolve, reject) { + const stream = fs.createWriteStream(zipFilePath) + pipeline(archive, stream, function (err) { + if (err) { + reject(err) + } else { + resolve() + } + }) + }) + + return BPromise.join(streamArchiveToFile, addFilesToArchiveAndFinalize) + } } module.exports = ProjectArchive diff --git a/services/history-v1/storage/lib/zip_store.js b/services/history-v1/storage/lib/zip_store.js index ed5121ed8a..b3cf7c2663 100644 --- a/services/history-v1/storage/lib/zip_store.js +++ b/services/history-v1/storage/lib/zip_store.js @@ -34,7 +34,7 @@ class ZipStore { /** * Generate signed link to access the zip file. * - * @param {number} projectId + * @param {number | string} projectId * @param {number} version * @return {string} */ @@ -49,7 +49,7 @@ class ZipStore { /** * Generate a zip of the given snapshot. * - * @param {number} projectId + * @param {number | string} projectId * @param {number} version * @param {Snapshot} snapshot */ diff --git a/services/history-v1/test/acceptance/js/storage/zip_store.test.js b/services/history-v1/test/acceptance/js/storage/zip_store.test.js index 756873d5ad..367fb6a946 100644 --- a/services/history-v1/test/acceptance/js/storage/zip_store.test.js +++ b/services/history-v1/test/acceptance/js/storage/zip_store.test.js @@ -53,4 +53,49 @@ describe('zipStore', function () { expect(entries.length).to.equal(1) expect(entries[0].fileName).to.equal('hello.txt') }) + + it('filters out tracked deletes', async function () { + const projectId = fixtures.docs.uninitializedProject.id + const version = 1 + const testSnapshot = new Snapshot() + testSnapshot.addFile( + 'test.tex', + File.fromRaw({ + content: 'the quick brown fox jumps over the lazy dog', + trackedChanges: [ + { + range: { pos: 4, length: 6 }, + tracking: { + type: 'delete', + ts: '2024-01-01T00:00:00.000Z', + userId: 'user1', + }, + }, + { + range: { pos: 35, length: 5 }, + tracking: { + type: 'delete', + ts: '2023-01-01T00:00:00.000Z', + userId: 'user2', + }, + }, + ], + }) + ) + + const zipUrl = await zipStore.getSignedUrl(projectId, version) + // Build the zip file. + await zipStore.storeZip(projectId, version, testSnapshot) + // Now we should be able to fetch it. + const postZipResponse = await fetch(zipUrl) + expect(postZipResponse.status).to.equal(200) + const zipBuffer = await postZipResponse.buffer() + await fs.writeFileAsync(zipFilePath, zipBuffer) + const entries = await getZipEntries(zipFilePath) + expect(entries.length).to.equal(1) + expect(entries[0].fileName).to.equal('test.tex') + expect(entries[0].uncompressedSize).to.equal( + 'the brown fox jumps over the dog'.length + ) + }) })