From 97b39ef98f70aaea4362f137612b826dfda8eb25 Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Fri, 15 Nov 2024 15:45:04 +0100 Subject: [PATCH] Merge pull request #21681 from overleaf/jpa-back-fill-file-hash [history-v1] add script for back filling hash into project file-tree GitOrigin-RevId: 70ab8c72f3eea1062708e4471b4ad1c60317ad26 --- libraries/mongo-utils/batchedUpdate.js | 1 + .../object-persistor/src/AbstractPersistor.js | 8 +- .../src/PerProjectEncryptedS3Persistor.js | 2 + libraries/object-persistor/src/S3Persistor.js | 18 +- package-lock.json | 25 + services/history-v1/buildscript.txt | 2 +- .../config/custom-environment-variables.json | 21 + services/history-v1/config/default.json | 10 + services/history-v1/config/development.json | 12 + services/history-v1/config/production.json | 6 +- services/history-v1/config/test.json | 13 + services/history-v1/docker-compose.ci.yml | 145 +++ services/history-v1/docker-compose.yml | 144 +++ services/history-v1/package.json | 3 + .../storage/lib/backupPersistor.mjs | 107 ++ .../storage/lib/blob_store/index.js | 2 +- services/history-v1/storage/lib/mongodb.js | 14 +- .../storage/scripts/back_fill_file_hash.mjs | 824 ++++++++++++++ .../test/acceptance/certs/.gitignore | 2 + .../js/storage/back_fill_file_hash.test.mjs | 1009 +++++++++++++++++ .../acceptance/js/storage/support/cleanup.js | 6 + services/history-v1/test/setup.js | 1 + 22 files changed, 2363 insertions(+), 12 deletions(-) create mode 100644 services/history-v1/storage/lib/backupPersistor.mjs create mode 100644 services/history-v1/storage/scripts/back_fill_file_hash.mjs create mode 100644 services/history-v1/test/acceptance/certs/.gitignore create mode 100644 services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs diff --git a/libraries/mongo-utils/batchedUpdate.js b/libraries/mongo-utils/batchedUpdate.js index 89965e4aa9..be0370956a 100644 --- a/libraries/mongo-utils/batchedUpdate.js +++ b/libraries/mongo-utils/batchedUpdate.js @@ -277,6 +277,7 @@ function batchedUpdateWithResultHandling( } module.exports = { + READ_PREFERENCE_SECONDARY, batchedUpdate, batchedUpdateWithResultHandling, } diff --git a/libraries/object-persistor/src/AbstractPersistor.js b/libraries/object-persistor/src/AbstractPersistor.js index a1a432d9fc..3352766655 100644 --- a/libraries/object-persistor/src/AbstractPersistor.js +++ b/libraries/object-persistor/src/AbstractPersistor.js @@ -35,12 +35,12 @@ module.exports = class AbstractPersistor { /** * @param location * @param name - * @param {Object} opts - * @param {Number} opts.start - * @param {Number} opts.end + * @param {Object} [opts] + * @param {Number} [opts.start] + * @param {Number} [opts.end] * @return {Promise} */ - async getObjectStream(location, name, opts) { + async getObjectStream(location, name, opts = {}) { throw new NotImplementedError('method not implemented in persistor', { method: 'getObjectStream', location, diff --git a/libraries/object-persistor/src/PerProjectEncryptedS3Persistor.js b/libraries/object-persistor/src/PerProjectEncryptedS3Persistor.js index ea90578760..a5172ac464 100644 --- a/libraries/object-persistor/src/PerProjectEncryptedS3Persistor.js +++ b/libraries/object-persistor/src/PerProjectEncryptedS3Persistor.js @@ -201,6 +201,7 @@ class PerProjectEncryptedS3Persistor extends S3Persistor { // Do not overwrite any objects if already created ifNoneMatch: '*', ssecOptions: await this.#getCurrentKeyEncryptionKey(projectFolder), + contentLength: 32, } ) return new SSECOptions(dataEncryptionKey) @@ -404,6 +405,7 
@@ class CachedPerProjectEncryptedS3Persistor { * @param {Object} opts * @param {string} [opts.contentType] * @param {string} [opts.contentEncoding] + * @param {number} [opts.contentLength] * @param {'*'} [opts.ifNoneMatch] * @param {SSECOptions} [opts.ssecOptions] * @param {string} [opts.sourceMd5] diff --git a/libraries/object-persistor/src/S3Persistor.js b/libraries/object-persistor/src/S3Persistor.js index 4f438c445c..0a9ff6d260 100644 --- a/libraries/object-persistor/src/S3Persistor.js +++ b/libraries/object-persistor/src/S3Persistor.js @@ -85,6 +85,7 @@ class S3Persistor extends AbstractPersistor { * @param {Object} opts * @param {string} [opts.contentType] * @param {string} [opts.contentEncoding] + * @param {number} [opts.contentLength] * @param {'*'} [opts.ifNoneMatch] * @param {SSECOptions} [opts.ssecOptions] * @param {string} [opts.sourceMd5] @@ -118,6 +119,9 @@ class S3Persistor extends AbstractPersistor { if (opts.contentEncoding) { uploadOptions.ContentEncoding = opts.contentEncoding } + if (opts.contentLength) { + uploadOptions.ContentLength = opts.contentLength + } if (opts.ifNoneMatch === '*') { uploadOptions.IfNoneMatch = '*' } @@ -134,9 +138,15 @@ class S3Persistor extends AbstractPersistor { clientOptions.computeChecksums = true } - await this._getClientForBucket(bucketName, clientOptions) - .upload(uploadOptions, { partSize: this.settings.partSize }) - .promise() + if (this.settings.disableMultiPartUpload) { + await this._getClientForBucket(bucketName, clientOptions) + .putObject(uploadOptions) + .promise() + } else { + await this._getClientForBucket(bucketName, clientOptions) + .upload(uploadOptions, { partSize: this.settings.partSize }) + .promise() + } } catch (err) { throw PersistorHelper.wrapError( err, @@ -150,7 +160,7 @@ class S3Persistor extends AbstractPersistor { /** * @param {string} bucketName * @param {string} key - * @param {Object} opts + * @param {Object} [opts] * @param {number} [opts.start] * @param {number} [opts.end] * @param {boolean} [opts.autoGunzip] diff --git a/package-lock.json b/package-lock.json index 042625d7d2..e16ad3b12d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -5759,6 +5759,17 @@ "node": ">=10" } }, + "node_modules/@google-cloud/secret-manager": { + "version": "5.6.0", + "resolved": "https://registry.npmjs.org/@google-cloud/secret-manager/-/secret-manager-5.6.0.tgz", + "integrity": "sha512-0daW/OXQEVc6VQKPyJTQNyD+563I/TYQ7GCQJx4dq3lB666R9FUPvqHx9b/o/qQtZ5pfuoCbGZl3krpxgTSW8Q==", + "dependencies": { + "google-gax": "^4.0.3" + }, + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@google-cloud/storage": { "version": "6.10.1", "resolved": "https://registry.npmjs.org/@google-cloud/storage/-/storage-6.10.1.tgz", @@ -39688,10 +39699,13 @@ "version": "1.0.0", "license": "Proprietary", "dependencies": { + "@google-cloud/secret-manager": "^5.6.0", "@overleaf/logger": "*", "@overleaf/metrics": "*", + "@overleaf/mongo-utils": "*", "@overleaf/o-error": "*", "@overleaf/object-persistor": "*", + "@overleaf/promise-utils": "*", "@overleaf/redis-wrapper": "*", "@overleaf/settings": "*", "@overleaf/stream-utils": "^0.1.0", @@ -47889,6 +47903,14 @@ "resolved": "https://registry.npmjs.org/@google-cloud/promisify/-/promisify-2.0.4.tgz", "integrity": "sha512-j8yRSSqswWi1QqUGKVEKOG03Q7qOoZP6/h2zN2YO+F5h2+DHU0bSrHCK9Y7lo2DI9fBd8qGAw795sf+3Jva4yA==" }, + "@google-cloud/secret-manager": { + "version": "5.6.0", + "resolved": "https://registry.npmjs.org/@google-cloud/secret-manager/-/secret-manager-5.6.0.tgz", + "integrity": 
"sha512-0daW/OXQEVc6VQKPyJTQNyD+563I/TYQ7GCQJx4dq3lB666R9FUPvqHx9b/o/qQtZ5pfuoCbGZl3krpxgTSW8Q==", + "requires": { + "google-gax": "^4.0.3" + } + }, "@google-cloud/storage": { "version": "6.10.1", "resolved": "https://registry.npmjs.org/@google-cloud/storage/-/storage-6.10.1.tgz", @@ -69319,10 +69341,13 @@ "overleaf-editor": { "version": "file:services/history-v1", "requires": { + "@google-cloud/secret-manager": "^5.6.0", "@overleaf/logger": "*", "@overleaf/metrics": "*", + "@overleaf/mongo-utils": "*", "@overleaf/o-error": "*", "@overleaf/object-persistor": "*", + "@overleaf/promise-utils": "*", "@overleaf/redis-wrapper": "*", "@overleaf/settings": "*", "@overleaf/stream-utils": "^0.1.0", diff --git a/services/history-v1/buildscript.txt b/services/history-v1/buildscript.txt index f222dacf86..09b2c7f137 100644 --- a/services/history-v1/buildscript.txt +++ b/services/history-v1/buildscript.txt @@ -1,5 +1,5 @@ history-v1 ---dependencies=postgres,gcs,mongo +--dependencies=postgres,gcs,mongo,s3 --docker-repos=us-east1-docker.pkg.dev/overleaf-ops/ol-docker --env-add= --env-pass-through= diff --git a/services/history-v1/config/custom-environment-variables.json b/services/history-v1/config/custom-environment-variables.json index 1822d33aa4..6adeb78c4f 100644 --- a/services/history-v1/config/custom-environment-variables.json +++ b/services/history-v1/config/custom-environment-variables.json @@ -8,6 +8,8 @@ "s3": { "key": "AWS_ACCESS_KEY_ID", "secret": "AWS_SECRET_ACCESS_KEY", + "endpoint": "AWS_S3_ENDPOINT", + "pathStyle": "AWS_S3_PATH_STYLE", "maxRetries": "S3_MAX_RETRIES", "httpOptions": { "timeout": "S3_TIMEOUT" @@ -30,6 +32,19 @@ "buckets": "PERSISTOR_BUCKET_MAPPING" } }, + "backupPersistor": { + "keyEncryptionKeys": "BACKUP_KEY_ENCRYPTION_KEYS", + "s3SSEC": { + "key": "AWS_ACCESS_KEY_ID", + "secret": "AWS_SECRET_ACCESS_KEY", + "endpoint": "AWS_S3_ENDPOINT", + "pathStyle": "AWS_S3_PATH_STYLE", + "maxRetries": "BACKUP_S3_MAX_RETRIES", + "httpOptions": { + "timeout": "BACKUP_S3_TIMEOUT" + } + } + }, "blobStore": { "globalBucket": "OVERLEAF_EDITOR_BLOBS_BUCKET", "projectBucket": "OVERLEAF_EDITOR_PROJECT_BLOBS_BUCKET" @@ -42,6 +57,12 @@ "bucket": "OVERLEAF_EDITOR_ZIPS_BUCKET", "zipTimeoutMs": "ZIP_STORE_ZIP_TIMEOUT_MS" }, + "backupStore": { + "chunksBucket":"BACKUP_OVERLEAF_EDITOR_CHUNKS_BUCKET", + "deksBucket":"BACKUP_OVERLEAF_EDITOR_DEKS_BUCKET", + "globalBlobsBucket":"BACKUP_OVERLEAF_EDITOR_GLOBAL_BLOBS_BUCKET", + "projectBlobsBucket":"BACKUP_OVERLEAF_EDITOR_PROJECT_BLOBS_BUCKET" + }, "mongo": { "uri": "MONGO_CONNECTION_STRING" }, diff --git a/services/history-v1/config/default.json b/services/history-v1/config/default.json index 84fd220789..87e8e8ab3a 100644 --- a/services/history-v1/config/default.json +++ b/services/history-v1/config/default.json @@ -13,6 +13,16 @@ "deleteConcurrency": "50" } }, + "backupPersistor": { + "backend": "s3SSEC", + "s3SSEC": { + "maxRetries": "1", + "pathStyle": false, + "httpOptions": { + "timeout": "8000" + } + } + }, "chunkStore": { "historyStoreConcurrency": "4" }, diff --git a/services/history-v1/config/development.json b/services/history-v1/config/development.json index f1423290b5..9cd73c62c1 100644 --- a/services/history-v1/config/development.json +++ b/services/history-v1/config/development.json @@ -23,6 +23,18 @@ "zipStore": { "bucket": "overleaf-development-zips" }, + "backupStore": { + "chunksBucket":"overleaf-development-history-chunks", + "deksBucket":"overleaf-development-history-deks", + 
"globalBlobsBucket":"overleaf-development-history-global-blobs", + "projectBlobsBucket":"overleaf-development-history-project-blobs" + }, + "backupPersistor": { + "keyEncryptionKeys": "[{\"key\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\",\"salt\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\"}]", + "s3SSEC": { + "ca": "[\"/certs/public.crt\"]" + } + }, "useDeleteObjects": "false", "mongo": { "uri": "mongodb://mongo:27017/sharelatex" diff --git a/services/history-v1/config/production.json b/services/history-v1/config/production.json index ffcd4415b0..23f836b1f2 100644 --- a/services/history-v1/config/production.json +++ b/services/history-v1/config/production.json @@ -1 +1,5 @@ -{ } +{ + "backupPersistor": { + "tieringStorageClass": "INTELLIGENT_TIERING" + } +} diff --git a/services/history-v1/config/test.json b/services/history-v1/config/test.json index 1e4ddd3a0b..3550fcc0fd 100644 --- a/services/history-v1/config/test.json +++ b/services/history-v1/config/test.json @@ -20,6 +20,19 @@ "zipStore": { "bucket": "overleaf-test-zips" }, + "backupStore": { + "chunksBucket":"overleaf-test-history-chunks", + "deksBucket":"overleaf-test-history-deks", + "globalBlobsBucket":"overleaf-test-history-global-blobs", + "projectBlobsBucket":"overleaf-test-history-project-blobs" + }, + "backupPersistor": { + "keyEncryptionKeys": "[{\"key\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\",\"salt\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\"}]", + "s3SSEC": { + "ca": "[\"/certs/public.crt\"]" + }, + "tieringStorageClass": "REDUCED_REDUNDANCY" + }, "maxDeleteKeys": "3", "useDeleteObjects": "false", "mongo": { diff --git a/services/history-v1/docker-compose.ci.yml b/services/history-v1/docker-compose.ci.yml index 4de133e440..13a36ad6a7 100644 --- a/services/history-v1/docker-compose.ci.yml +++ b/services/history-v1/docker-compose.ci.yml @@ -21,17 +21,31 @@ services: ELASTIC_SEARCH_DSN: es:9200 MONGO_HOST: mongo POSTGRES_HOST: postgres + AWS_S3_ENDPOINT: https://minio:9000 + AWS_S3_PATH_STYLE: 'true' + AWS_ACCESS_KEY_ID: OVERLEAF_HISTORY_S3_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY: OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY + MINIO_ROOT_USER: MINIO_ROOT_USER + MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD GCS_API_ENDPOINT: http://gcs:9090 GCS_PROJECT_ID: fake STORAGE_EMULATOR_HOST: http://gcs:9090/storage/v1 MOCHA_GREP: ${MOCHA_GREP} NODE_ENV: test NODE_OPTIONS: "--unhandled-rejections=strict" + volumes: + - ./test/acceptance/certs:/certs depends_on: mongo: condition: service_healthy postgres: condition: service_healthy + certs: + condition: service_completed_successfully + minio: + condition: service_started + minio_setup: + condition: service_completed_successfully gcs: condition: service_healthy user: node @@ -63,6 +77,137 @@ services: interval: 1s retries: 20 + certs: + image: node:18.20.4 + volumes: + - ./test/acceptance/certs:/certs + working_dir: /certs + entrypoint: sh + command: + - '-cex' + - | + if [ ! -f ./certgen ]; then + wget -O ./certgen "https://github.com/minio/certgen/releases/download/v1.3.0/certgen-linux-$(dpkg --print-architecture)" + chmod +x ./certgen + fi + if [ ! -f private.key ] || [ ! 
-f public.crt ]; then + ./certgen -host minio + fi + + minio: + image: minio/minio:RELEASE.2024-10-13T13-34-11Z + command: server /data + volumes: + - ./test/acceptance/certs:/root/.minio/certs + environment: + MINIO_ROOT_USER: MINIO_ROOT_USER + MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD + depends_on: + certs: + condition: service_completed_successfully + + minio_setup: + depends_on: + certs: + condition: service_completed_successfully + minio: + condition: service_started + image: minio/mc:RELEASE.2024-10-08T09-37-26Z + volumes: + - ./test/acceptance/certs:/root/.mc/certs/CAs + entrypoint: sh + command: + - '-cex' + - | + sleep 1 + mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \ + || sleep 3 && \ + mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \ + || sleep 3 && \ + mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \ + || sleep 3 && \ + mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD + mc mb --ignore-existing s3/overleaf-test-history-chunks + mc mb --ignore-existing s3/overleaf-test-history-deks + mc mb --ignore-existing s3/overleaf-test-history-global-blobs + mc mb --ignore-existing s3/overleaf-test-history-project-blobs + mc admin user add s3 \ + OVERLEAF_HISTORY_S3_ACCESS_KEY_ID \ + OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY + echo ' + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:ListBucket" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-chunks" + }, + { + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:DeleteObject" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-chunks/*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:ListBucket" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-deks" + }, + { + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:DeleteObject" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-deks/*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:ListBucket" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-global-blobs" + }, + { + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:DeleteObject" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-global-blobs/*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:ListBucket" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-project-blobs" + }, + { + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:DeleteObject" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-project-blobs/*" + } + ] + }' > policy-history.json + + mc admin policy create s3 overleaf-history policy-history.json + mc admin policy attach s3 overleaf-history \ + --user=OVERLEAF_HISTORY_S3_ACCESS_KEY_ID gcs: image: fsouza/fake-gcs-server:1.45.2 command: ["--port=9090", "--scheme=http"] diff --git a/services/history-v1/docker-compose.yml b/services/history-v1/docker-compose.yml index c0f7036308..b630e1e92f 100644 --- a/services/history-v1/docker-compose.yml +++ b/services/history-v1/docker-compose.yml @@ -25,11 +25,18 @@ services: - .:/overleaf/services/history-v1 - ../../node_modules:/overleaf/node_modules - ../../libraries:/overleaf/libraries + - ./test/acceptance/certs:/certs working_dir: /overleaf/services/history-v1 environment: ELASTIC_SEARCH_DSN: es:9200 MONGO_HOST: mongo POSTGRES_HOST: postgres + AWS_S3_ENDPOINT: https://minio:9000 + AWS_S3_PATH_STYLE: 'true' + AWS_ACCESS_KEY_ID: OVERLEAF_HISTORY_S3_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY: OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY 
+ MINIO_ROOT_USER: MINIO_ROOT_USER + MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD GCS_API_ENDPOINT: http://gcs:9090 GCS_PROJECT_ID: fake STORAGE_EMULATOR_HOST: http://gcs:9090/storage/v1 @@ -43,6 +50,12 @@ services: condition: service_healthy postgres: condition: service_healthy + certs: + condition: service_completed_successfully + minio: + condition: service_started + minio_setup: + condition: service_completed_successfully gcs: condition: service_healthy command: npm run --silent test:acceptance @@ -66,6 +79,137 @@ services: interval: 1s retries: 20 + certs: + image: node:18.20.4 + volumes: + - ./test/acceptance/certs:/certs + working_dir: /certs + entrypoint: sh + command: + - '-cex' + - | + if [ ! -f ./certgen ]; then + wget -O ./certgen "https://github.com/minio/certgen/releases/download/v1.3.0/certgen-linux-$(dpkg --print-architecture)" + chmod +x ./certgen + fi + if [ ! -f private.key ] || [ ! -f public.crt ]; then + ./certgen -host minio + fi + + minio: + image: minio/minio:RELEASE.2024-10-13T13-34-11Z + command: server /data + volumes: + - ./test/acceptance/certs:/root/.minio/certs + environment: + MINIO_ROOT_USER: MINIO_ROOT_USER + MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD + depends_on: + certs: + condition: service_completed_successfully + + minio_setup: + depends_on: + certs: + condition: service_completed_successfully + minio: + condition: service_started + image: minio/mc:RELEASE.2024-10-08T09-37-26Z + volumes: + - ./test/acceptance/certs:/root/.mc/certs/CAs + entrypoint: sh + command: + - '-cex' + - | + sleep 1 + mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \ + || sleep 3 && \ + mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \ + || sleep 3 && \ + mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \ + || sleep 3 && \ + mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD + mc mb --ignore-existing s3/overleaf-test-history-chunks + mc mb --ignore-existing s3/overleaf-test-history-deks + mc mb --ignore-existing s3/overleaf-test-history-global-blobs + mc mb --ignore-existing s3/overleaf-test-history-project-blobs + mc admin user add s3 \ + OVERLEAF_HISTORY_S3_ACCESS_KEY_ID \ + OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY + echo ' + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:ListBucket" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-chunks" + }, + { + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:DeleteObject" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-chunks/*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:ListBucket" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-deks" + }, + { + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:DeleteObject" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-deks/*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:ListBucket" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-global-blobs" + }, + { + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:DeleteObject" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-global-blobs/*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:ListBucket" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-project-blobs" + }, + { + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:DeleteObject" + ], + "Resource": "arn:aws:s3:::overleaf-test-history-project-blobs/*" + } + ] + }' > policy-history.json + + mc admin policy create s3 
overleaf-history policy-history.json + mc admin policy attach s3 overleaf-history \ + --user=OVERLEAF_HISTORY_S3_ACCESS_KEY_ID gcs: image: fsouza/fake-gcs-server:1.45.2 command: ["--port=9090", "--scheme=http"] diff --git a/services/history-v1/package.json b/services/history-v1/package.json index fcf09e18de..54622ff5ef 100644 --- a/services/history-v1/package.json +++ b/services/history-v1/package.json @@ -6,10 +6,13 @@ "license": "Proprietary", "private": true, "dependencies": { + "@google-cloud/secret-manager": "^5.6.0", "@overleaf/logger": "*", "@overleaf/metrics": "*", + "@overleaf/mongo-utils": "*", "@overleaf/o-error": "*", "@overleaf/object-persistor": "*", + "@overleaf/promise-utils": "*", "@overleaf/redis-wrapper": "*", "@overleaf/settings": "*", "@overleaf/stream-utils": "^0.1.0", diff --git a/services/history-v1/storage/lib/backupPersistor.mjs b/services/history-v1/storage/lib/backupPersistor.mjs new file mode 100644 index 0000000000..959c914371 --- /dev/null +++ b/services/history-v1/storage/lib/backupPersistor.mjs @@ -0,0 +1,107 @@ +// @ts-check +import fs from 'node:fs' +import Path from 'node:path' +import _ from 'lodash' +import config from 'config' +import { SecretManagerServiceClient } from '@google-cloud/secret-manager' +import { + PerProjectEncryptedS3Persistor, + RootKeyEncryptionKey, +} from '@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor.js' + +const persistorConfig = _.cloneDeep(config.get('backupPersistor')) +const { chunksBucket, deksBucket, globalBlobsBucket, projectBlobsBucket } = + config.get('backupStore') + +export { chunksBucket, globalBlobsBucket, projectBlobsBucket } + +function convertKey(key, convertFn) { + if (_.has(persistorConfig, key)) { + _.update(persistorConfig, key, convertFn) + } +} + +convertKey('s3SSEC.httpOptions.timeout', s => parseInt(s, 10)) +convertKey('s3SSEC.maxRetries', s => parseInt(s, 10)) +convertKey('s3SSEC.pathStyle', s => s === 'true') +// array of CA, either inlined or on disk +convertKey('s3SSEC.ca', s => + JSON.parse(s).map(ca => (ca.startsWith('/') ? fs.readFileSync(ca) : ca)) +) + +/** @type {() => Promise} */ +let getRawRootKeyEncryptionKeys + +if ((process.env.NODE_ENV || 'production') === 'production') { + ;[persistorConfig.s3SSEC.key, persistorConfig.s3SSEC.secret] = ( + await loadFromSecretsManager( + process.env.BACKUP_AWS_CREDENTIALS || '', + 'BACKUP_AWS_CREDENTIALS' + ) + ).split(':') + getRawRootKeyEncryptionKeys = () => + loadFromSecretsManager( + persistorConfig.keyEncryptionKeys, + 'BACKUP_KEY_ENCRYPTION_KEYS' + ) +} else { + getRawRootKeyEncryptionKeys = () => persistorConfig.keyEncryptionKeys +} + +export const DELETION_ONLY = persistorConfig.keyEncryptionKeys === 'none' +if (DELETION_ONLY) { + // For Backup-deleter; should not encrypt or read data; deleting does not need key. 
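// Note: the getter assigned below returns a Promise that never settles, so in deletion-only mode any code path that would need a key encryption key (encrypting or decrypting blobs) stalls instead of proceeding without a key; deletions themselves do not await it.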
+ getRawRootKeyEncryptionKeys = () => new Promise(_resolve => {}) +} + +/** + * @param {string} bucketName + * @param {string} path + * @return {string} + */ +function pathToProjectFolder(bucketName, path) { + switch (bucketName) { + case deksBucket: + case chunksBucket: + case projectBlobsBucket: + return Path.join(...path.split('/').slice(0, 3)) + '/' + default: + throw new Error(`${bucketName} does not store per-project files`) + } +} + +/** + * @param {string} name + * @param {string} label + * @return {Promise} + */ +async function loadFromSecretsManager(name, label) { + const client = new SecretManagerServiceClient() + const [version] = await client.accessSecretVersion({ name }) + if (!version.payload?.data) throw new Error(`empty secret: ${label}`) + return version.payload.data.toString() +} + +async function getRootKeyEncryptionKeys() { + return JSON.parse(await getRawRootKeyEncryptionKeys()).map( + ({ key, salt }) => { + return new RootKeyEncryptionKey( + Buffer.from(key, 'base64'), + Buffer.from(salt, 'base64') + ) + } + ) +} + +export const backupPersistor = new PerProjectEncryptedS3Persistor({ + ...persistorConfig.s3SSEC, + disableMultiPartUpload: true, + dataEncryptionKeyBucketName: deksBucket, + pathToProjectFolder, + getRootKeyEncryptionKeys, + storageClass: { + [deksBucket]: 'STANDARD', + [chunksBucket]: persistorConfig.tieringStorageClass, + [projectBlobsBucket]: persistorConfig.tieringStorageClass, + }, +}) diff --git a/services/history-v1/storage/lib/blob_store/index.js b/services/history-v1/storage/lib/blob_store/index.js index fc9a1ab5d1..89daf60d7b 100644 --- a/services/history-v1/storage/lib/blob_store/index.js +++ b/services/history-v1/storage/lib/blob_store/index.js @@ -337,4 +337,4 @@ class BlobStore { } } -module.exports = { BlobStore, loadGlobalBlobs } +module.exports = { BlobStore, loadGlobalBlobs, makeProjectKey, GLOBAL_BLOBS } diff --git a/services/history-v1/storage/lib/mongodb.js b/services/history-v1/storage/lib/mongodb.js index 53b1837a8f..938e9555c2 100644 --- a/services/history-v1/storage/lib/mongodb.js +++ b/services/history-v1/storage/lib/mongodb.js @@ -10,7 +10,19 @@ const chunks = db.collection('projectHistoryChunks') const blobs = db.collection('projectHistoryBlobs') const globalBlobs = db.collection('projectHistoryGlobalBlobs') const shardedBlobs = db.collection('projectHistoryShardedBlobs') +// Temporary collection for tracking progress of backed up old blobs (without a hash). +// The initial sync process will be able to skip over these. 
+// Schema: _id: projectId, blobs: [Binary] +const backedUpBlobs = db.collection('projectHistoryBackedUpBlobs') Metrics.mongodb.monitor(client) -module.exports = { client, db, chunks, blobs, globalBlobs, shardedBlobs } +module.exports = { + client, + db, + chunks, + blobs, + globalBlobs, + shardedBlobs, + backedUpBlobs, +} diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs new file mode 100644 index 0000000000..9b73680c94 --- /dev/null +++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs @@ -0,0 +1,824 @@ +// @ts-check +import Crypto from 'node:crypto' +import Events from 'node:events' +import fs from 'node:fs' +import Path from 'node:path' +import Stream from 'node:stream' +import zLib from 'node:zlib' +import { setTimeout } from 'node:timers/promises' +import { Binary, ObjectId } from 'mongodb' +import logger from '@overleaf/logger' +import { + batchedUpdate, + READ_PREFERENCE_SECONDARY, +} from '@overleaf/mongo-utils/batchedUpdate.js' +import OError from '@overleaf/o-error' +import { + AlreadyWrittenError, + NoKEKMatchedError, + NotFoundError, +} from '@overleaf/object-persistor/src/Errors.js' +import { promiseMapWithLimit } from '@overleaf/promise-utils' +import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs' +import { + BlobStore, + GLOBAL_BLOBS, + loadGlobalBlobs, + makeProjectKey, +} from '../lib/blob_store/index.js' +import { backedUpBlobs, db } from '../lib/mongodb.js' +import filestorePersistor from '../lib/persistor.js' + +// Silence warning. +Events.setMaxListeners(20) + +// Enable caching for ObjectId.toString() +ObjectId.cacheHexString = true + +/** + * @typedef {import("overleaf-editor-core").Blob} Blob + * @typedef {import("mongodb").Collection} Collection + * @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor + */ + +/** + * @typedef {Object} FileRef + * @property {ObjectId} _id + * @property {string} hash + */ + +/** + * @typedef {Object} Folder + * @property {Array} folders + * @property {Array} fileRefs + */ + +/** + * @typedef {Object} DeletedFileRef + * @property {ObjectId} _id + * @property {ObjectId} projectId + * @property {string} hash + */ + +/** + * @typedef {Object} Project + * @property {ObjectId} _id + * @property {Array} rootFolder + * @property {Array} deletedFileIds + * @property {{history: {id: string}}} overleaf + */ + +/** + * @typedef {Object} QueueEntry + * @property {ProjectContext} ctx + * @property {string} fileId + * @property {string} path + * @property {string} [hash] + */ + +// Time of closing the ticket for adding hashes: https://github.com/overleaf/internal/issues/464#issuecomment-492668129 +const ALL_PROJECTS_HAVE_FILE_HASHES_AFTER = new Date('2019-05-15T14:02:00Z') +const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z') +const BATCH_RANGE_START = + process.env.BATCH_RANGE_START || + ObjectId.createFromTime(PUBLIC_LAUNCH_DATE.getTime() / 1000).toString() +const BATCH_RANGE_END = + process.env.BATCH_RANGE_END || + ObjectId.createFromTime( + ALL_PROJECTS_HAVE_FILE_HASHES_AFTER.getTime() / 1000 + ).toString() +// We need to control the start and end as ids of deleted projects are created at time of deletion. 
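// Note: the two env vars are deleted below presumably so that batchedUpdate does not pick them up from the environment again; the bounds are instead passed explicitly for live projects and applied to deleterData.deletedProjectId for deleted projects.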
+delete process.env.BATCH_RANGE_START +delete process.env.BATCH_RANGE_END + +// Concurrency for downloading from GCS and updating hashes in mongo +const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10) +// Retries for processing a given file +const RETRIES = parseInt(process.env.RETRIES || '10', 10) +const RETRY_DELAY_MS = parseInt(process.env.RETRY_DELAY_MS || '100', 10) + +const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || '' +if (!USER_FILES_BUCKET_NAME) { + throw new Error('env var USER_FILES_BUCKET_NAME is missing') +} +const RETRY_FILESTORE_404 = process.env.RETRY_FILESTORE_404 === 'true' +const BUFFER_DIR = fs.mkdtempSync( + process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-' +) + +const projectsCollection = db.collection('projects') +const deletedProjectsCollection = db.collection('deletedProjects') +const deletedFilesCollection = db.collection('deletedFiles') + +const STATS = { + projects: 0, + filesWithoutHash: 0, + filesDuplicated: 0, + filesRetries: 0, + filesFailed: 0, + fileTreeUpdated: 0, + globalBlobsCount: 0, + globalBlobsEgress: 0, + projectDeleted: 0, + projectHardDeleted: 0, + fileHardDeleted: 0, + mongoUpdates: 0, + deduplicatedWriteToAWSLocalCount: 0, + deduplicatedWriteToAWSLocalEgress: 0, + deduplicatedWriteToAWSRemoteCount: 0, + deduplicatedWriteToAWSRemoteEgress: 0, + writeToAWSCount: 0, + writeToAWSEgress: 0, +} + +function printStats() { + console.log( + JSON.stringify({ + time: new Date(), + ...STATS, + }) + ) +} + +setInterval(printStats, 60_000) + +/** + * @param {QueueEntry} entry + * @return {Promise} + */ +async function processFile(entry) { + for (let attempt = 0; attempt < RETRIES; attempt++) { + try { + return await processFileOnce(entry) + } catch (err) { + if (err instanceof NotFoundError) { + const { bucketName } = OError.getFullInfo(err) + if (bucketName === USER_FILES_BUCKET_NAME && !RETRY_FILESTORE_404) { + throw err // disable retries for not found in filestore bucket case + } + } + if (err instanceof NoKEKMatchedError) { + throw err // disable retries when upload to S3 will fail again + } + STATS.filesRetries++ + const { + ctx: { projectId }, + fileId, + path, + } = entry + logger.warn( + { err, projectId, fileId, path, attempt }, + 'failed to process file, trying again' + ) + await setTimeout(RETRY_DELAY_MS) + } + } + return await processFileOnce(entry) +} + +/** + * @param {QueueEntry} entry + * @return {Promise} + */ +async function processFileOnce(entry) { + const { fileId } = entry + const { projectId, historyId } = entry.ctx + const filePath = Path.join( + BUFFER_DIR, + projectId.toString() + fileId.toString() + ) + const dst = fs.createWriteStream(filePath) + const src = await filestorePersistor.getObjectStream( + USER_FILES_BUCKET_NAME, + `${projectId}/${fileId}` + ) + await Stream.promises.pipeline(src, dst) + + const blobStore = new BlobStore(historyId) + const blob = await blobStore.putFile(filePath) + const hash = blob.getHash() + + if (GLOBAL_BLOBS.has(hash)) { + STATS.globalBlobsCount++ + STATS.globalBlobsEgress += estimateBlobSize(blob) + return hash + } + + if (entry.ctx.hasBackedUpBlob(hash)) { + STATS.deduplicatedWriteToAWSLocalCount++ + STATS.deduplicatedWriteToAWSLocalEgress += estimateBlobSize(blob) + return hash + } + entry.ctx.recordPendingBlob(hash) + + let backupSource + let contentEncoding + const md5 = Crypto.createHash('md5') + let size + if (blob.getStringLength()) { + const filePathCompressed = filePath + '.gz' + backupSource = filePathCompressed + contentEncoding = 'gzip' + 
size = 0 + await Stream.promises.pipeline( + fs.createReadStream(filePath), + zLib.createGzip(), + async function* (source) { + for await (const chunk of source) { + size += chunk.byteLength + md5.update(chunk) + yield chunk + } + }, + fs.createWriteStream(filePathCompressed) + ) + } else { + backupSource = filePath + size = blob.getByteLength() + await Stream.promises.pipeline(fs.createReadStream(filePath), md5) + } + const backendKeyPath = makeProjectKey(historyId, blob.getHash()) + const persistor = await entry.ctx.getCachedPersistor(backendKeyPath) + try { + STATS.writeToAWSCount++ + await persistor.sendStream( + projectBlobsBucket, + backendKeyPath, + fs.createReadStream(backupSource), + { + contentEncoding, + contentType: 'application/octet-stream', + contentLength: size, + sourceMd5: md5.digest('hex'), + ifNoneMatch: '*', // de-duplicate write (we pay for the request, but avoid egress) + } + ) + STATS.writeToAWSEgress += size + } catch (err) { + if (err instanceof AlreadyWrittenError) { + STATS.deduplicatedWriteToAWSRemoteCount++ + STATS.deduplicatedWriteToAWSRemoteEgress += size + } else { + STATS.writeToAWSEgress += size + entry.ctx.recordFailedBlob(hash) + throw err + } + } + entry.ctx.recordBackedUpBlob(hash) + return hash +} + +/** + * @param {Array} files + * @return {Promise} + */ +async function processFiles(files) { + if (files.length === 0) return // all processed + await fs.promises.mkdir(BUFFER_DIR, { recursive: true }) + try { + await promiseMapWithLimit( + CONCURRENCY, + files, + /** + * @param {QueueEntry} entry + * @return {Promise} + */ + async function (entry) { + try { + await entry.ctx.processFile(entry) + } catch (err) { + STATS.filesFailed++ + const { + ctx: { projectId }, + fileId, + path, + } = entry + logger.error( + { err, projectId, fileId, path }, + 'failed to process file' + ) + } + } + ) + } finally { + await fs.promises.rm(BUFFER_DIR, { recursive: true, force: true }) + } +} + +/** + * @param {Array} batch + * @param {string} prefix + * @return {Promise} + */ +async function handleLiveTreeBatch(batch, prefix = 'rootFolder.0') { + if (process.argv.includes('deletedFiles')) { + await collectDeletedFiles(batch) + } + const files = Array.from(findFileInBatch(batch, prefix)) + STATS.projects += batch.length + STATS.filesWithoutHash += files.length + batch.length = 0 // GC + // The files are currently ordered by project-id. + // Order them by file-id to + // - avoid head-of-line blocking from many project-files waiting on the generation of the projects DEK (round trip to AWS) + // - bonus: increase chance of de-duplicating write to AWS + files.sort((a, b) => (a.fileId > b.fileId ? 
1 : -1)) + await processFiles(files) + await promiseMapWithLimit( + CONCURRENCY, + files, + /** + * @param {QueueEntry} entry + * @return {Promise} + */ + async function (entry) { + await entry.ctx.flushMongoQueues() + } + ) +} + +/** + * @param {Array<{project: Project}>} batch + * @return {Promise} + */ +async function handleDeletedFileTreeBatch(batch) { + await handleLiveTreeBatch( + batch.map(d => d.project), + 'project.rootFolder.0' + ) +} + +/** + * @param {QueueEntry} entry + * @return {Promise} + */ +async function tryUpdateFileRefInMongo(entry) { + if (entry.path === '') { + return await tryUpdateDeletedFileRefInMongo(entry) + } else if (entry.path.startsWith('project.')) { + return await tryUpdateFileRefInMongoInDeletedProject(entry) + } + + STATS.mongoUpdates++ + const result = await projectsCollection.updateOne( + { + _id: entry.ctx.projectId, + [`${entry.path}._id`]: new ObjectId(entry.fileId), + }, + { + $set: { [`${entry.path}.hash`]: entry.hash }, + } + ) + return result.matchedCount === 1 +} + +/** + * @param {QueueEntry} entry + * @return {Promise} + */ +async function tryUpdateDeletedFileRefInMongo(entry) { + STATS.mongoUpdates++ + const result = await deletedFilesCollection.updateOne( + { + _id: new ObjectId(entry.fileId), + projectId: entry.ctx.projectId, + }, + { $set: { hash: entry.hash } } + ) + return result.matchedCount === 1 +} + +/** + * @param {QueueEntry} entry + * @return {Promise} + */ +async function tryUpdateFileRefInMongoInDeletedProject(entry) { + STATS.mongoUpdates++ + const result = await deletedProjectsCollection.updateOne( + { + 'deleterData.deletedProjectId': entry.ctx.projectId, + [`${entry.path}._id`]: new ObjectId(entry.fileId), + }, + { + $set: { [`${entry.path}.hash`]: entry.hash }, + } + ) + return result.matchedCount === 1 +} + +const RETRY_UPDATE_HASH = 100 + +/** + * @param {QueueEntry} entry + * @return {Promise} + */ +async function updateFileRefInMongo(entry) { + if (await tryUpdateFileRefInMongo(entry)) return + + const { fileId } = entry + const { projectId } = entry.ctx + for (let i = 0; i < RETRY_UPDATE_HASH; i++) { + let prefix = 'rootFolder.0' + let p = await projectsCollection.findOne( + { _id: projectId }, + { projection: { rootFolder: 1 } } + ) + if (!p) { + STATS.projectDeleted++ + prefix = 'project.rootFolder.0' + const deletedProject = await deletedProjectsCollection.findOne( + { + 'deleterData.deletedProjectId': projectId, + project: { $exists: true }, + }, + { projection: { 'project.rootFolder': 1 } } + ) + p = deletedProject?.project + if (!p) { + STATS.projectHardDeleted++ + console.warn( + 'bug: project hard-deleted while processing', + projectId, + fileId + ) + return + } + } + let found = false + for (const e of findFiles(entry.ctx, p.rootFolder[0], prefix)) { + found = e.fileId === fileId + if (!found) continue + if (await tryUpdateFileRefInMongo(e)) return + break + } + if (!found) { + if (await tryUpdateDeletedFileRefInMongo(entry)) return + STATS.fileHardDeleted++ + console.warn('bug: file hard-deleted while processing', projectId, fileId) + return + } + + STATS.fileTreeUpdated++ + } + throw new OError( + 'file-tree updated repeatedly while trying to add hash', + entry + ) +} + +/** + * @param {ProjectContext} ctx + * @param {Folder} folder + * @param {string} path + * @return Generator + */ +function* findFiles(ctx, folder, path) { + let i = 0 + for (const child of folder.folders) { + yield* findFiles(ctx, child, `${path}.folders.${i}`) + i++ + } + i = 0 + for (const fileRef of folder.fileRefs) { + if 
(!fileRef.hash) { + yield { + ctx, + fileId: fileRef._id.toString(), + path: `${path}.fileRefs.${i}`, + } + } + i++ + } +} + +/** + * @param {Array} projects + * @param {string} prefix + */ +function* findFileInBatch(projects, prefix) { + for (const project of projects) { + const ctx = new ProjectContext(project) + yield* findFiles(ctx, project.rootFolder[0], prefix) + for (const fileId of project.deletedFileIds || []) { + yield { ctx, fileId, path: '' } + } + } +} + +/** + * @param {Array} projects + * @return {Promise} + */ +async function collectDeletedFiles(projects) { + const cursor = deletedFilesCollection.find( + { + projectId: { $in: projects.map(p => p._id) }, + hash: { $exists: false }, + }, + { + projection: { _id: 1, projectId: 1 }, + readPreference: READ_PREFERENCE_SECONDARY, + sort: { projectId: 1 }, + } + ) + const processed = projects.slice() + for await (const deletedFileRef of cursor) { + const idx = processed.findIndex( + p => p._id.toString() === deletedFileRef.projectId.toString() + ) + if (idx === -1) { + throw new Error( + `bug: order of deletedFiles mongo records does not match batch of projects (${deletedFileRef.projectId} out of order)` + ) + } + processed.splice(0, idx) + const project = processed[0] + project.deletedFileIds = project.deletedFileIds || [] + project.deletedFileIds.push(deletedFileRef._id.toString()) + } +} + +const BATCH_HASH_WRITES = 1_000 +const BATCH_FILE_UPDATES = 100 + +class ProjectContext { + /** @type {Promise | null} */ + #cachedPersistorPromise = null + + /** + * @param {Project} project + */ + constructor(project) { + this.projectId = project._id + this.historyId = project.overleaf.history.id.toString() + } + + /** + * @param {string} key + * @return {Promise} + */ + getCachedPersistor(key) { + if (!this.#cachedPersistorPromise) { + // Fetch DEK once, but only if needed -- upon the first use + this.#cachedPersistorPromise = this.#getCachedPersistorWithRetries(key) + } + return this.#cachedPersistorPromise + } + + /** + * @param {string} key + * @return {Promise} + */ + async #getCachedPersistorWithRetries(key) { + for (let attempt = 0; attempt < RETRIES; attempt++) { + try { + return await backupPersistor.forProject(projectBlobsBucket, key) + } catch (err) { + if (err instanceof NoKEKMatchedError) { + throw err + } else { + logger.warn( + { err, projectId: this.projectId, attempt }, + 'failed to get DEK, trying again' + ) + await setTimeout(RETRY_DELAY_MS) + } + } + } + return await backupPersistor.forProject(projectBlobsBucket, key) + } + + async flushMongoQueuesIfNeeded() { + if (this.#completedBlobs.size > BATCH_HASH_WRITES) { + await this.#storeBackedUpBlobs() + } + if (this.#pendingFileWrites.length > BATCH_FILE_UPDATES) { + await this.#storeFileHashes() + } + } + + async flushMongoQueues() { + await this.#storeBackedUpBlobs() + await this.#storeFileHashes() + } + + /** @type {Set} */ + #pendingBlobs = new Set() + /** @type {Set} */ + #completedBlobs = new Set() + + async #storeBackedUpBlobs() { + if (this.#completedBlobs.size === 0) return + const blobs = Array.from(this.#completedBlobs).map( + hash => new Binary(Buffer.from(hash, 'hex')) + ) + this.#completedBlobs.clear() + STATS.mongoUpdates++ + await backedUpBlobs.updateOne( + { _id: this.projectId }, + { $addToSet: { blobs: { $each: blobs } } }, + { upsert: true } + ) + } + + /** + * @param {string} hash + */ + recordPendingBlob(hash) { + this.#pendingBlobs.add(hash) + } + + /** + * @param {string} hash + */ + recordFailedBlob(hash) { + this.#pendingBlobs.delete(hash) + } + + 
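// Note: #pendingBlobs tracks uploads that are still in flight so hasBackedUpBlob() can de-duplicate concurrent writes of the same hash within a project, while #completedBlobs buffers finished hashes until #storeBackedUpBlobs() flushes them to the backedUpBlobs collection in batches.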
/** + * @param {string} hash + */ + recordBackedUpBlob(hash) { + this.#completedBlobs.add(hash) + this.#pendingBlobs.delete(hash) + } + + /** + * @param {string} hash + * @return {boolean} + */ + hasBackedUpBlob(hash) { + return this.#pendingBlobs.has(hash) || this.#completedBlobs.has(hash) + } + + /** @type {Array<QueueEntry>} */ + #pendingFileWrites = [] + + /** + * @param {QueueEntry} entry + */ + queueFileForWritingHash(entry) { + this.#pendingFileWrites.push(entry) + } + + /** + * @param {Collection} collection + * @param {Array<QueueEntry>} entries + * @param {Object} query + * @return {Promise<Array<QueueEntry>>} + */ + async #tryBatchHashWrites(collection, entries, query) { + if (entries.length === 0) return [] + const update = {} + for (const entry of entries) { + query[`${entry.path}._id`] = new ObjectId(entry.fileId) + update[`${entry.path}.hash`] = entry.hash + } + STATS.mongoUpdates++ + const result = await collection.updateOne(query, { $set: update }) + if (result.matchedCount === 1) { + return [] // all updated + } + return entries + } + + async #storeFileHashes() { + if (this.#pendingFileWrites.length === 0) return + const individualUpdates = [] + const projectEntries = [] + const deletedProjectEntries = [] + for (const entry of this.#pendingFileWrites) { + if (entry.path === '') { + individualUpdates.push(entry) + } else if (entry.path.startsWith('project.')) { + deletedProjectEntries.push(entry) + } else { + projectEntries.push(entry) + } + } + this.#pendingFileWrites.length = 0 + + // Try to process them together, otherwise fallback to individual updates and retries. + individualUpdates.push( + ...(await this.#tryBatchHashWrites(projectsCollection, projectEntries, { + _id: this.projectId, + })) + ) + individualUpdates.push( + ...(await this.#tryBatchHashWrites( + deletedProjectsCollection, + deletedProjectEntries, + { 'deleterData.deletedProjectId': this.projectId } + )) + ) + for (const entry of individualUpdates) { + await updateFileRefInMongo(entry) + } + } + + /** @type {Map<string, Promise<string>>} */ + #pendingFiles = new Map() + + /** + * @param {QueueEntry} entry + */ + async processFile(entry) { + if (this.#pendingFiles.has(entry.fileId)) { + STATS.filesDuplicated++ + } else { + this.#pendingFiles.set(entry.fileId, processFile(entry)) + } + entry.hash = await this.#pendingFiles.get(entry.fileId) + this.queueFileForWritingHash(entry) + await this.flushMongoQueuesIfNeeded() + } +} + +/** + * @param {Blob} blob + * @return {number} + */ +function estimateBlobSize(blob) { + let size = blob.getByteLength() + if (blob.getStringLength()) { + // approximation for gzip (25 bytes gzip overhead and 20% compression ratio) + size = 25 + Math.ceil(size * 0.2) + } + return size +} + +async function updateLiveFileTrees() { + await batchedUpdate( + projectsCollection, + { 'overleaf.history.id': { $exists: true } }, + handleLiveTreeBatch, + { rootFolder: 1, _id: 1, 'overleaf.history.id': 1 }, + {}, + { + BATCH_RANGE_START, + BATCH_RANGE_END, + } + ) + console.warn('Done updating live projects') +} + +async function updateDeletedFileTrees() { + await batchedUpdate( + deletedProjectsCollection, + { + 'deleterData.deletedProjectId': { + $gt: new ObjectId(BATCH_RANGE_START), + $lte: new ObjectId(BATCH_RANGE_END), + }, + 'project.overleaf.history.id': { $exists: true }, + }, + handleDeletedFileTreeBatch, + { + 'project.rootFolder': 1, + 'project._id': 1, + 'project.overleaf.history.id': 1, + } + ) + console.warn('Done updating deleted projects') +} + +async function main() { + await loadGlobalBlobs() + if (process.argv.includes('live')) { + await
updateLiveFileTrees() + } + if (process.argv.includes('deleted')) { + await updateDeletedFileTrees() + } + console.warn('Done.') +} + +try { + try { + await main() + } finally { + printStats() + } + + let code = 0 + if (STATS.filesFailed > 0) { + console.warn('Some files could not be processed, see logs and try again') + code++ + } + if (STATS.fileHardDeleted > 0) { + console.warn( + 'Some hashes could not be updated as the files were hard-deleted, this should not happen' + ) + code++ + } + if (STATS.projectHardDeleted > 0) { + console.warn( + 'Some hashes could not be updated as the project was hard-deleted, this should not happen' + ) + code++ + } + process.exit(code) +} catch (err) { + console.error(err) + process.exit(1) +} diff --git a/services/history-v1/test/acceptance/certs/.gitignore b/services/history-v1/test/acceptance/certs/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/services/history-v1/test/acceptance/certs/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs new file mode 100644 index 0000000000..9281cefc2d --- /dev/null +++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs @@ -0,0 +1,1009 @@ +import Crypto from 'node:crypto' +import Stream from 'node:stream' +import { setTimeout } from 'node:timers/promises' +import { promisify } from 'node:util' +import { ObjectId, Binary } from 'mongodb' +import { + db, + backedUpBlobs, + globalBlobs, +} from '../../../../storage/lib/mongodb.js' +import cleanup from './support/cleanup.js' +import testProjects from '../api/support/test_projects.js' +import { execFile } from 'node:child_process' +import { expect } from 'chai' +import config from 'config' +import ObjectPersistor from '@overleaf/object-persistor' +import { WritableBuffer } from '@overleaf/stream-utils' +import { + backupPersistor, + projectBlobsBucket, +} from '../../../../storage/lib/backupPersistor.mjs' +import projectKey from '../../../../storage/lib/project_key.js' +import { + BlobStore, + makeProjectKey, +} from '../../../../storage/lib/blob_store/index.js' + +const TIMEOUT = 20 * 1_000 + +const { deksBucket } = config.get('backupStore') +const { tieringStorageClass } = config.get('backupPersistor') + +const projectsCollection = db.collection('projects') +const deletedProjectsCollection = db.collection('deletedProjects') +const deletedFilesCollection = db.collection('deletedFiles') + +const FILESTORE_PERSISTOR = ObjectPersistor({ + backend: 'gcs', + gcs: { + endpoint: { + apiEndpoint: process.env.GCS_API_ENDPOINT, + projectId: process.env.GCS_PROJECT_ID, + }, + }, +}) + +/** + * @param {ObjectId} objectId + * @return {string} + */ +function gitBlobHash(objectId) { + return gitBlobHashBuffer(Buffer.from(objectId.toString())) +} + +/** + * @param {Buffer} buf + * @return {string} + */ +function gitBlobHashBuffer(buf) { + const sha = Crypto.createHash('sha1') + sha.update(`blob ${buf.byteLength}\x00`) + sha.update(buf) + return sha.digest('hex') +} + +/** + * @param {string} gitBlobHash + * @return {Binary} + */ +function binaryForGitBlobHash(gitBlobHash) { + return new Binary(Buffer.from(gitBlobHash, 'hex')) +} + +async function listS3Bucket(bucket, wantStorageClass) { + const client = backupPersistor._getClientForBucket(bucket) + const response = await client.listObjectsV2({ Bucket: bucket }).promise() + + for (const object of response.Contents || []) { + 
expect(object).to.have.property('StorageClass', wantStorageClass) + } + + return (response.Contents || []).map(item => item.Key || '') +} + +function objectIdFromTime(timestamp) { + return ObjectId.createFromTime(new Date(timestamp).getTime() / 1000) +} + +const PRINT_IDS_AND_HASHES_FOR_DEBUGGING = false + +describe('back_fill_file_hash script', function () { + this.timeout(TIMEOUT) + const USER_FILES_BUCKET_NAME = 'fake-user-files-gcs' + + const projectId0 = objectIdFromTime('2017-01-01T00:00:00Z') + const projectId1 = objectIdFromTime('2017-01-01T00:01:00Z') + const projectId2 = objectIdFromTime('2017-01-01T00:02:00Z') + const projectId3 = objectIdFromTime('2024-01-01T00:03:00Z') + const projectIdDeleted0 = objectIdFromTime('2017-01-01T00:04:00Z') + const projectIdDeleted1 = objectIdFromTime('2024-01-01T00:05:00Z') + const projectIdNoHistory = objectIdFromTime('2017-01-01T00:06:00Z') + const projectIdNoHistoryDeleted = objectIdFromTime('2017-01-01T00:07:00Z') + const projectIdHardDeleted = objectIdFromTime('2017-01-01T00:08:00Z') + const projectIdNoOverleaf = objectIdFromTime('2017-01-01T00:09:00Z') + const projectIdNoOverleafDeleted = objectIdFromTime('2017-01-01T00:10:00Z') + const historyId0 = 42 // stored as number is mongo + const historyId1 = projectId1.toString() + const historyId2 = projectId2.toString() + const historyId3 = projectId3.toString() + const historyIdDeleted0 = projectIdDeleted0.toString() + const historyIdDeleted1 = projectIdDeleted1.toString() + const fileId0 = objectIdFromTime('2017-02-01T00:00:00Z') + const fileId1 = objectIdFromTime('2017-02-01T00:01:00Z') + const fileId2 = objectIdFromTime('2017-02-01T00:02:00Z') + const fileId3 = objectIdFromTime('2017-02-01T00:03:00Z') + const fileId4 = objectIdFromTime('2017-02-01T00:04:00Z') + const fileId5 = objectIdFromTime('2024-02-01T00:05:00Z') + const fileId6 = objectIdFromTime('2017-02-01T00:06:00Z') + const fileId7 = objectIdFromTime('2017-02-01T00:07:00Z') + const fileIdDeleted1 = objectIdFromTime('2017-02-01T00:07:00Z') + const fileIdDeleted2 = objectIdFromTime('2017-02-01T00:08:00Z') + const fileIdDeleted3 = objectIdFromTime('2017-02-01T00:09:00Z') + const fileIdDeleted4 = objectIdFromTime('2024-02-01T00:10:00Z') + const fileIdDeleted5 = objectIdFromTime('2024-02-01T00:11:00Z') + const deleteProjectsRecordId0 = new ObjectId() + const deleteProjectsRecordId1 = new ObjectId() + const deleteProjectsRecordId2 = new ObjectId() + const deleteProjectsRecordId3 = new ObjectId() + const deleteProjectsRecordId4 = new ObjectId() + const contentFile7 = Buffer.alloc(11_000_000) + const hashFile7 = gitBlobHashBuffer(contentFile7) + const writtenBlobs = [ + { projectId: projectId0, historyId: historyId0, fileId: fileId0 }, + // { historyId: projectId0, fileId: fileId6 }, // global blob + { + projectId: projectId0, + historyId: historyId0, + fileId: fileId7, + hash: hashFile7, + }, + { projectId: projectId0, historyId: historyId0, fileId: fileIdDeleted5 }, + { projectId: projectId1, historyId: historyId1, fileId: fileId1 }, + { projectId: projectId1, historyId: historyId1, fileId: fileIdDeleted1 }, + // { historyId: historyId2, fileId: fileId2 }, // already has hash + // { historyId: historyId3, fileId: fileId3 }, // too new + { + projectId: projectIdDeleted0, + historyId: historyIdDeleted0, + fileId: fileId4, + }, + { + projectId: projectIdDeleted0, + historyId: historyIdDeleted0, + fileId: fileIdDeleted2, + }, + // { historyId: historyIdDeleted0, fileId:fileIdDeleted3 }, // fileIdDeleted3 is dupe of fileIdDeleted2 + // { 
historyId: historyIdDeleted0, fileId: fileIdDeleted4 }, // already has hash + ] + if (PRINT_IDS_AND_HASHES_FOR_DEBUGGING) { + const fileIds = { + fileId0, + fileId1, + fileId2, + fileId3, + fileId4, + fileId5, + fileId6, + fileIdDeleted1, + fileIdDeleted2, + fileIdDeleted3, + fileIdDeleted4, + } + console.log({ + projectId0, + projectId1, + projectId2, + projectId3, + projectIdDeleted0, + projectIdDeleted1, + historyId0, + historyId1, + historyId2, + historyId3, + historyIdDeleted0, + historyIdDeleted1, + ...fileIds, + }) + for (const [name, v] of Object.entries(fileIds)) { + console.log(name, gitBlobHash(v)) + } + } + + beforeEach(cleanup.everything) + beforeEach('cleanup s3 buckets', async function () { + await backupPersistor.deleteDirectory(deksBucket, '') + await backupPersistor.deleteDirectory(projectBlobsBucket, '') + expect(await listS3Bucket(deksBucket)).to.have.length(0) + expect(await listS3Bucket(projectBlobsBucket)).to.have.length(0) + }) + + beforeEach('populate mongo', async function () { + await globalBlobs.insertMany([ + { _id: gitBlobHash(fileId6), byteLength: 24, stringLength: 24 }, + ]) + await projectsCollection.insertMany([ + { + _id: projectId0, + rootFolder: [ + { + fileRefs: [{ _id: fileId0 }, { _id: fileId6 }, { _id: fileId7 }], + folders: [{ fileRefs: [], folders: [] }], + }, + ], + overleaf: { history: { id: historyId0 } }, + }, + { + _id: projectId1, + rootFolder: [ + { + fileRefs: [{ _id: fileId1 }], + folders: [ + { + fileRefs: [], + folders: [{ fileRefs: [{ _id: fileId1 }], folders: [] }], + }, + ], + }, + ], + overleaf: { history: { id: historyId1 } }, + }, + { + _id: projectId2, + rootFolder: [ + { + fileRefs: [], + folders: [ + { + fileRefs: [], + folders: [ + { + fileRefs: [{ _id: fileId2, hash: gitBlobHash(fileId2) }], + folders: [], + }, + ], + }, + ], + }, + ], + overleaf: { history: { id: historyId2 } }, + }, + { + _id: projectId3, + rootFolder: [ + { + fileRefs: [], + folders: [ + { + fileRefs: [], + folders: [ + { + fileRefs: [{ _id: fileId3, hash: gitBlobHash(fileId3) }], + folders: [], + }, + ], + }, + ], + }, + ], + overleaf: { history: { id: historyId3 } }, + }, + { + _id: projectIdNoHistory, + rootFolder: [{ fileRefs: [], folders: [] }], + overleaf: { history: { conversionFailed: true } }, + }, + { + _id: projectIdNoOverleaf, + rootFolder: [{ fileRefs: [], folders: [] }], + }, + ]) + await deletedProjectsCollection.insertMany([ + { + _id: deleteProjectsRecordId0, + project: { + _id: projectIdDeleted0, + rootFolder: [ + { + fileRefs: [], + folders: [ + { + fileRefs: [], + folders: [{ fileRefs: [{ _id: fileId4 }], folders: [] }], + }, + ], + }, + ], + overleaf: { history: { id: historyIdDeleted0 } }, + }, + deleterData: { + deletedProjectId: projectIdDeleted0, + }, + }, + { + _id: deleteProjectsRecordId1, + project: { + _id: projectIdDeleted1, + rootFolder: [ + { + fileRefs: [], + folders: [ + { + fileRefs: [], + folders: [ + { + fileRefs: [{ _id: fileId5, hash: gitBlobHash(fileId5) }], + folders: [], + }, + ], + }, + ], + }, + ], + overleaf: { history: { id: historyIdDeleted1 } }, + }, + deleterData: { + deletedProjectId: projectIdDeleted1, + }, + }, + { + _id: deleteProjectsRecordId2, + project: { + _id: projectIdNoHistoryDeleted, + rootFolder: [{ fileRefs: [], folders: [] }], + overleaf: { history: { conversionFailed: true } }, + }, + deleterData: { + deletedProjectId: projectIdNoHistoryDeleted, + }, + }, + { + _id: deleteProjectsRecordId3, + deleterData: { deletedProjectId: projectIdHardDeleted }, + }, + { + _id: deleteProjectsRecordId4, + 
+        project: {
+          _id: projectIdNoOverleafDeleted,
+          rootFolder: [{ fileRefs: [], folders: [] }],
+        },
+        deleterData: {
+          deletedProjectId: projectIdNoOverleafDeleted,
+        },
+      },
+    ])
+    await deletedFilesCollection.insertMany([
+      { _id: fileIdDeleted1, projectId: projectId1 },
+      { _id: fileIdDeleted2, projectId: projectIdDeleted0 },
+      { _id: fileIdDeleted3, projectId: projectIdDeleted0 },
+      {
+        _id: fileIdDeleted4,
+        projectId: projectIdDeleted0,
+        hash: gitBlobHash(fileIdDeleted4),
+      },
+      { _id: fileIdDeleted5, projectId: projectId0 },
+    ])
+
+    await testProjects.createEmptyProject(historyId0.toString())
+    await testProjects.createEmptyProject(historyId1)
+    await testProjects.createEmptyProject(historyId2)
+    await testProjects.createEmptyProject(historyId3)
+    await testProjects.createEmptyProject(historyIdDeleted0)
+    await testProjects.createEmptyProject(historyIdDeleted1)
+  })
+
+  beforeEach('populate filestore', async function () {
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectId0}/${fileId0}`,
+      Stream.Readable.from([fileId0.toString()])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectId0}/${fileId6}`,
+      Stream.Readable.from([fileId6.toString()])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectId0}/${fileId7}`,
+      Stream.Readable.from([contentFile7])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectId0}/${fileIdDeleted5}`,
+      Stream.Readable.from([fileIdDeleted5.toString()])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectId1}/${fileId1}`,
+      Stream.Readable.from([fileId1.toString()])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectId2}/${fileId2}`,
+      Stream.Readable.from([fileId2.toString()])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectId3}/${fileId3}`,
+      Stream.Readable.from([fileId3.toString()])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectIdDeleted0}/${fileId4}`,
+      Stream.Readable.from([fileId4.toString()])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectIdDeleted1}/${fileId5}`,
+      Stream.Readable.from([fileId5.toString()])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectId1}/${fileIdDeleted1}`,
+      Stream.Readable.from([fileIdDeleted1.toString()])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectIdDeleted0}/${fileIdDeleted2}`,
+      Stream.Readable.from([fileIdDeleted2.toString()])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectIdDeleted0}/${fileIdDeleted3}`,
+      // same content as 2, deduplicate
+      Stream.Readable.from([fileIdDeleted2.toString()])
+    )
+    await FILESTORE_PERSISTOR.sendStream(
+      USER_FILES_BUCKET_NAME,
+      `${projectIdDeleted0}/${fileIdDeleted4}`,
+      Stream.Readable.from([fileIdDeleted4.toString()])
+    )
+  })
+
+  async function tryRunScript(env = {}) {
+    let result
+    try {
+      result = await promisify(execFile)(
+        process.argv0,
+        [
+          'storage/scripts/back_fill_file_hash.mjs',
+          'live',
+          'deleted',
+          'deletedFiles',
+        ],
+        {
+          encoding: 'utf-8',
+          timeout: TIMEOUT - 500,
+          env: {
+            ...process.env,
+            USER_FILES_BUCKET_NAME,
+            ...env,
+            LOG_LEVEL: 'warn', // Override LOG_LEVEL of acceptance tests
+          },
+        }
+      )
+      result.status = 0
+    } catch (err) {
+      const { stdout, stderr, code } = err
+      if (typeof code !== 'number') {
+        console.log(err)
+      }
+      result = { stdout, stderr, status: code }
+    }
+    const stats = JSON.parse(result.stdout.trimEnd().split('\n').pop())
+    expect(new Date(stats.time).toISOString()).to.equal(stats.time)
+    delete stats.time
+    return { stats, result }
+  }
+
+  async function runScript(env = {}) {
+    const { stats, result } = await tryRunScript(env)
+    if (result.status !== 0) {
+      console.log(result)
+      expect(result).to.have.property('status', 0)
+    }
+    return { stats, result }
+  }
+
+  function commonAssertions() {
+    it('should update mongo', async function () {
+      expect(await projectsCollection.find({}).toArray()).to.deep.equal([
+        {
+          _id: projectId0,
+          rootFolder: [
+            {
+              fileRefs: [
+                { _id: fileId0, hash: gitBlobHash(fileId0) },
+                { _id: fileId6, hash: gitBlobHash(fileId6) },
+                { _id: fileId7, hash: hashFile7 },
+              ],
+              folders: [{ fileRefs: [], folders: [] }],
+            },
+          ],
+          overleaf: { history: { id: historyId0 } },
+        },
+        {
+          _id: projectId1,
+          rootFolder: [
+            {
+              fileRefs: [{ _id: fileId1, hash: gitBlobHash(fileId1) }],
+              folders: [
+                {
+                  fileRefs: [],
+                  folders: [
+                    {
+                      fileRefs: [{ _id: fileId1, hash: gitBlobHash(fileId1) }],
+                      folders: [],
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          overleaf: { history: { id: historyId1 } },
+        },
+        {
+          _id: projectId2,
+          rootFolder: [
+            {
+              fileRefs: [],
+              folders: [
+                {
+                  fileRefs: [],
+                  folders: [
+                    {
+                      fileRefs: [{ _id: fileId2, hash: gitBlobHash(fileId2) }],
+                      folders: [],
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          overleaf: { history: { id: historyId2 } },
+        },
+        {
+          _id: projectId3,
+          rootFolder: [
+            {
+              fileRefs: [],
+              folders: [
+                {
+                  fileRefs: [],
+                  folders: [
+                    {
+                      fileRefs: [{ _id: fileId3, hash: gitBlobHash(fileId3) }],
+                      folders: [],
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          overleaf: { history: { id: historyId3 } },
+        },
+        {
+          _id: projectIdNoHistory,
+          rootFolder: [{ fileRefs: [], folders: [] }],
+          overleaf: { history: { conversionFailed: true } },
+        },
+        {
+          _id: projectIdNoOverleaf,
+          rootFolder: [{ fileRefs: [], folders: [] }],
+        },
+      ])
+      expect(await deletedProjectsCollection.find({}).toArray()).to.deep.equal([
+        {
+          _id: deleteProjectsRecordId0,
+          project: {
+            _id: projectIdDeleted0,
+            rootFolder: [
+              {
+                fileRefs: [],
+                folders: [
+                  {
+                    fileRefs: [],
+                    folders: [
+                      {
+                        fileRefs: [
+                          { _id: fileId4, hash: gitBlobHash(fileId4) },
+                        ],
+                        folders: [],
+                      },
+                    ],
+                  },
+                ],
+              },
+            ],
+            overleaf: { history: { id: historyIdDeleted0 } },
+          },
+          deleterData: {
+            deletedProjectId: projectIdDeleted0,
+          },
+        },
+        {
+          _id: deleteProjectsRecordId1,
+          project: {
+            _id: projectIdDeleted1,
+            rootFolder: [
+              {
+                fileRefs: [],
+                folders: [
+                  {
+                    fileRefs: [],
+                    folders: [
+                      {
+                        fileRefs: [
+                          { _id: fileId5, hash: gitBlobHash(fileId5) },
+                        ],
+                        folders: [],
+                      },
+                    ],
+                  },
+                ],
+              },
+            ],
+            overleaf: { history: { id: historyIdDeleted1 } },
+          },
+          deleterData: {
+            deletedProjectId: projectIdDeleted1,
+          },
+        },
+        {
+          _id: deleteProjectsRecordId2,
+          project: {
+            _id: projectIdNoHistoryDeleted,
+            rootFolder: [{ fileRefs: [], folders: [] }],
+            overleaf: { history: { conversionFailed: true } },
+          },
+          deleterData: {
+            deletedProjectId: projectIdNoHistoryDeleted,
+          },
+        },
+        {
+          _id: deleteProjectsRecordId3,
+          deleterData: { deletedProjectId: projectIdHardDeleted },
+        },
+        {
+          _id: deleteProjectsRecordId4,
+          project: {
+            _id: projectIdNoOverleafDeleted,
+            rootFolder: [{ fileRefs: [], folders: [] }],
+          },
+          deleterData: {
+            deletedProjectId: projectIdNoOverleafDeleted,
+          },
+        },
+      ])
+      expect(await deletedFilesCollection.find({}).toArray()).to.deep.equal([
+        {
+          _id: fileIdDeleted1,
+          projectId: projectId1,
+          hash: gitBlobHash(fileIdDeleted1),
+        },
+        {
+          _id: fileIdDeleted2,
+          projectId: projectIdDeleted0,
+          hash: gitBlobHash(fileIdDeleted2),
+        },
+        {
+          _id: fileIdDeleted3,
+          projectId: projectIdDeleted0,
+          // uses the same content as fileIdDeleted2
+          hash: gitBlobHash(fileIdDeleted2),
+        },
+        {
+          _id: fileIdDeleted4,
+          projectId: projectIdDeleted0,
+          hash: gitBlobHash(fileIdDeleted4),
+        },
+        {
+          _id: fileIdDeleted5,
+          projectId: projectId0,
+          hash: gitBlobHash(fileIdDeleted5),
+        },
+      ])
+      expect(
+        (await backedUpBlobs.find({}, { sort: { _id: 1 } }).toArray()).map(
+          entry => {
+            // blobs are pushed unordered into mongo. Sort the list for consistency.
+            entry.blobs.sort()
+            return entry
+          }
+        )
+      ).to.deep.equal([
+        {
+          _id: projectId0,
+          blobs: [
+            binaryForGitBlobHash(gitBlobHash(fileId0)),
+            binaryForGitBlobHash(hashFile7),
+            binaryForGitBlobHash(gitBlobHash(fileIdDeleted5)),
+          ].sort(),
+        },
+        {
+          _id: projectId1,
+          blobs: [
+            binaryForGitBlobHash(gitBlobHash(fileId1)),
+            binaryForGitBlobHash(gitBlobHash(fileIdDeleted1)),
+          ].sort(),
+        },
+        {
+          _id: projectIdDeleted0,
+          blobs: [
+            binaryForGitBlobHash(gitBlobHash(fileId4)),
+            binaryForGitBlobHash(gitBlobHash(fileIdDeleted2)),
+          ].sort(),
+        },
+      ])
+    })
+    it('should process nothing on re-run', async function () {
+      const rerun = await runScript()
+      expect(rerun.stats).deep.equal({
+        ...STATS_ALL_ZERO,
+        // We still need to iterate over all the projects.
+        projects: 4,
+      })
+    })
+    it('should have backed up all the files', async function () {
+      expect(tieringStorageClass).to.exist
+      const blobs = await listS3Bucket(projectBlobsBucket, tieringStorageClass)
+      expect(blobs.sort()).to.deep.equal(
+        writtenBlobs
+          .map(({ historyId, fileId, hash }) =>
+            makeProjectKey(historyId, hash || gitBlobHash(fileId))
+          )
+          .sort()
+      )
+      for (let { historyId, fileId, hash } of writtenBlobs) {
+        hash = hash || gitBlobHash(fileId.toString())
+        const s = await backupPersistor.getObjectStream(
+          projectBlobsBucket,
+          makeProjectKey(historyId, hash),
+          { autoGunzip: true }
+        )
+        const buf = new WritableBuffer()
+        await Stream.promises.pipeline(s, buf)
+        expect(gitBlobHashBuffer(buf.getContents())).to.equal(hash)
+        if (fileId !== fileId7) {
+          const id = buf.getContents().toString('utf-8')
+          expect(id).to.equal(fileId.toString())
+          // double check we are not comparing 'undefined' or '[object Object]' above
+          expect(id).to.match(/^[a-f0-9]{24}$/)
+        }
+      }
+      const deks = await listS3Bucket(deksBucket, 'STANDARD')
+      expect(deks.sort()).to.deep.equal(
+        Array.from(
+          new Set(
+            writtenBlobs.map(
+              ({ historyId }) => projectKey.format(historyId) + '/dek'
+            )
+          )
+        ).sort()
+      )
+    })
+    it('should have written the back filled files to history v1', async function () {
+      for (const { historyId, fileId } of writtenBlobs) {
+        const blobStore = new BlobStore(historyId.toString())
+        if (fileId === fileId7) {
+          const s = await blobStore.getStream(hashFile7)
+          const buf = new WritableBuffer()
+          await Stream.promises.pipeline(s, buf)
+          expect(buf.getContents()).to.deep.equal(contentFile7)
+          continue
+        }
+        const id = await blobStore.getString(gitBlobHash(fileId.toString()))
+        expect(id).to.equal(fileId.toString())
+        // double check we are not comparing 'undefined' or '[object Object]' above
+        expect(id).to.match(/^[a-f0-9]{24}$/)
+      }
+    })
+  }
+
+  function expectNotFoundError(result, msg) {
+    expect(result.stdout).to.include(msg)
+    const log = JSON.parse(
+      result.stdout.split('\n').find(l => l.includes(`"${msg}"`))
+    )
+    expect(log).to.contain({
+      projectId: projectId0.toString(),
+      fileId: fileId0.toString(),
+      path: 'rootFolder.0.fileRefs.0',
+      msg,
+    })
+    expect(log.err).to.contain({
+      name: 'NotFoundError',
+    })
+  }
+
+  const STATS_ALL_ZERO = {
+    projects: 0,
+    filesWithoutHash: 0,
+    filesDuplicated: 0,
+    filesRetries: 0,
+    filesFailed: 0,
+    globalBlobsCount: 0,
+    globalBlobsEgress: 0,
+    fileTreeUpdated: 0,
+    projectDeleted: 0,
+    projectHardDeleted: 0,
+    fileHardDeleted: 0,
+    mongoUpdates: 0,
+    deduplicatedWriteToAWSLocalCount: 0,
+    deduplicatedWriteToAWSLocalEgress: 0,
+    deduplicatedWriteToAWSRemoteCount: 0,
+    deduplicatedWriteToAWSRemoteEgress: 0,
+    writeToAWSCount: 0,
+    writeToAWSEgress: 0,
+  }
+  const STATS_UP_TO_PROJECT1 = {
+    projects: 2,
+    filesWithoutHash: 7,
+    filesDuplicated: 1,
+    filesRetries: 0,
+    filesFailed: 0,
+    globalBlobsCount: 1,
+    globalBlobsEgress: 30,
+    fileTreeUpdated: 0,
+    projectDeleted: 0,
+    projectHardDeleted: 0,
+    fileHardDeleted: 0,
+    mongoUpdates: 6,
+    deduplicatedWriteToAWSLocalCount: 0,
+    deduplicatedWriteToAWSLocalEgress: 0,
+    deduplicatedWriteToAWSRemoteCount: 0,
+    deduplicatedWriteToAWSRemoteEgress: 0,
+    writeToAWSCount: 5,
+    writeToAWSEgress: 11000118,
+  }
+  const STATS_UP_FROM_PROJECT1_ONWARD = {
+    projects: 2,
+    filesWithoutHash: 3,
+    filesDuplicated: 0,
+    filesRetries: 0,
+    filesFailed: 0,
+    globalBlobsCount: 0,
+    globalBlobsEgress: 0,
+    fileTreeUpdated: 0,
+    projectDeleted: 0,
+    projectHardDeleted: 0,
+    fileHardDeleted: 0,
+    mongoUpdates: 4,
+    deduplicatedWriteToAWSLocalCount: 1,
+    deduplicatedWriteToAWSLocalEgress: 30,
+    deduplicatedWriteToAWSRemoteCount: 0,
+    deduplicatedWriteToAWSRemoteEgress: 0,
+    writeToAWSCount: 2,
+    writeToAWSEgress: 58,
+  }
+
+  function sumStats(a, b) {
+    return Object.fromEntries(Object.entries(a).map(([k, v]) => [k, v + b[k]]))
+  }
+
+  const STATS_ALL = sumStats(
+    STATS_UP_TO_PROJECT1,
+    STATS_UP_FROM_PROJECT1_ONWARD
+  )
+
+  it('should gracefully handle fatal errors', async function () {
+    await FILESTORE_PERSISTOR.deleteObject(
+      USER_FILES_BUCKET_NAME,
+      `${projectId0}/${fileId0}`
+    )
+    const t0 = Date.now()
+    const { stats, result } = await tryRunScript({
+      RETRIES: '10',
+      RETRY_DELAY_MS: '1000',
+    })
+    const t1 = Date.now()
+    expectNotFoundError(result, 'failed to process file')
+    expect(result.status).to.equal(1)
+    expect(stats).to.deep.equal(
+      sumStats(STATS_ALL, {
+        ...STATS_ALL_ZERO,
+        filesFailed: 1,
+        writeToAWSCount: -1,
+        writeToAWSEgress: -28,
+      })
+    )
+    // should not retry 404
+    expect(result.stdout).to.not.include('failed to process file, trying again')
+    expect(t1 - t0).to.be.below(10_000)
+  })
+
+  it('should retry on error', async function () {
+    await FILESTORE_PERSISTOR.deleteObject(
+      USER_FILES_BUCKET_NAME,
+      `${projectId0}/${fileId0}`
+    )
+    const restoreFileAfter5s = async () => {
+      await setTimeout(5_000)
+      await FILESTORE_PERSISTOR.sendStream(
+        USER_FILES_BUCKET_NAME,
+        `${projectId0}/${fileId0}`,
+        Stream.Readable.from([fileId0.toString()])
+      )
+    }
+    // use Promise.allSettled to ensure the above sendStream call finishes before this test completes
+    const [
+      {
+        value: { stats, result },
+      },
+    ] = await Promise.allSettled([
+      tryRunScript({
+        RETRY_DELAY_MS: '100',
+        RETRIES: '60',
+        RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests
+      }),
+      restoreFileAfter5s(),
+    ])
+    expectNotFoundError(result, 'failed to process file, trying again')
+    expect(result.status).to.equal(0)
+    expect({ ...stats, filesRetries: 0 }).to.deep.equal(STATS_ALL)
+    expect(stats.filesRetries).to.be.greaterThan(0)
+  })
+
+  describe('full run CONCURRENCY=1', function () {
+    let output
+    beforeEach('run script', async function () {
+      output = await runScript({
+        CONCURRENCY: '1',
+      })
+    })
+
+    it('should print stats', function () {
+      expect(output.stats).deep.equal(STATS_ALL)
+    })
+    commonAssertions()
+  })
+
+  describe('full run CONCURRENCY=10', function () {
+    let output
+    beforeEach('run script', async function () {
+      output = await runScript({
+        CONCURRENCY: '10',
+      })
+    })
+    it('should print stats', function () {
+      expect(output.stats).deep.equal(STATS_ALL)
+    })
+    commonAssertions()
+  })
+
+  describe('with something in the bucket already', function () {
+    beforeEach('create a file in s3', async function () {
+      const buf = Buffer.from(fileId0.toString())
+      await backupPersistor.sendStream(
+        projectBlobsBucket,
+        makeProjectKey(historyId0, gitBlobHash(fileId0)),
+        Stream.Readable.from([buf]),
+        { contentLength: buf.byteLength }
+      )
+    })
+    let output
+    beforeEach('run script', async function () {
+      output = await runScript({
+        CONCURRENCY: '1',
+      })
+    })
+
+    it('should print stats', function () {
+      expect(output.stats).deep.equal(
+        sumStats(STATS_ALL, {
+          ...STATS_ALL_ZERO,
+          // one remote deduplicate
+          deduplicatedWriteToAWSRemoteCount: 1,
+          deduplicatedWriteToAWSRemoteEgress: 28,
+          writeToAWSEgress: -28, // subtract skipped egress
+        })
+      )
+    })
+    commonAssertions()
+  })
+
+  describe('split run CONCURRENCY=1', function () {
+    // part0: project0+project1, part1: project2 onwards
+    const edge = projectId1.toString()
+    let outputPart0, outputPart1
+    beforeEach('run script on part 0', async function () {
+      outputPart0 = await runScript({
+        CONCURRENCY: '1',
+        BATCH_RANGE_END: edge,
+      })
+    })
+    beforeEach('run script on part 1', async function () {
+      outputPart1 = await runScript({
+        CONCURRENCY: '1',
+        BATCH_RANGE_START: edge,
+      })
+    })
+
+    it('should print stats', function () {
+      expect(outputPart0.stats).to.deep.equal(STATS_UP_TO_PROJECT1)
+      expect(outputPart1.stats).to.deep.equal(STATS_UP_FROM_PROJECT1_ONWARD)
+    })
+    commonAssertions()
+  })
+})
diff --git a/services/history-v1/test/acceptance/js/storage/support/cleanup.js b/services/history-v1/test/acceptance/js/storage/support/cleanup.js
index 89eff859bb..b237d4fc17 100644
--- a/services/history-v1/test/acceptance/js/storage/support/cleanup.js
+++ b/services/history-v1/test/acceptance/js/storage/support/cleanup.js
@@ -14,6 +14,12 @@ const MONGO_COLLECTIONS = [
   'projectHistoryBlobs',
   'projectHistoryShardedBlobs',
   'projectHistoryChunks',
+
+  // back_fill_file_hash.test.mjs
+  'deletedFiles',
+  'deletedProjects',
+  'projects',
+  'projectHistoryBackedUpBlobs',
 ]
 
 // make sure we don't delete the wrong data by accident
diff --git a/services/history-v1/test/setup.js b/services/history-v1/test/setup.js
index c127661dbc..20f891ceb6 100644
--- a/services/history-v1/test/setup.js
+++ b/services/history-v1/test/setup.js
@@ -35,6 +35,7 @@ async function createGcsBuckets() {
     config.get('blobStore.projectBucket'),
     config.get('chunkStore.bucket'),
     config.get('zipStore.bucket'),
+    'fake-user-files-gcs',
   ]) {
     await fetch('http://gcs:9090/storage/v1/b', {
       method: 'POST',