Mirror of https://github.com/overleaf/overleaf.git (synced 2024-11-21 20:47:08 -05:00)

Merge pull request #21681 from overleaf/jpa-back-fill-file-hash

[history-v1] add script for back filling hash into project file-tree

GitOrigin-RevId: 70ab8c72f3eea1062708e4471b4ad1c60317ad26
parent: d6e94d2586
commit: 97b39ef98f

22 changed files with 2363 additions and 12 deletions
@@ -277,6 +277,7 @@ function batchedUpdateWithResultHandling(
}

module.exports = {
READ_PREFERENCE_SECONDARY,
batchedUpdate,
batchedUpdateWithResultHandling,
}

@@ -35,12 +35,12 @@ module.exports = class AbstractPersistor {
/**
* @param location
* @param name
* @param {Object} opts
* @param {Number} opts.start
* @param {Number} opts.end
* @param {Object} [opts]
* @param {Number} [opts.start]
* @param {Number} [opts.end]
* @return {Promise<NodeJS.ReadableStream>}
*/
async getObjectStream(location, name, opts) {
async getObjectStream(location, name, opts = {}) {
throw new NotImplementedError('method not implemented in persistor', {
method: 'getObjectStream',
location,

@@ -201,6 +201,7 @@ class PerProjectEncryptedS3Persistor extends S3Persistor {
// Do not overwrite any objects if already created
ifNoneMatch: '*',
ssecOptions: await this.#getCurrentKeyEncryptionKey(projectFolder),
contentLength: 32,
}
)
return new SSECOptions(dataEncryptionKey)

@@ -404,6 +405,7 @@ class CachedPerProjectEncryptedS3Persistor {
* @param {Object} opts
* @param {string} [opts.contentType]
* @param {string} [opts.contentEncoding]
* @param {number} [opts.contentLength]
* @param {'*'} [opts.ifNoneMatch]
* @param {SSECOptions} [opts.ssecOptions]
* @param {string} [opts.sourceMd5]

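For illustration only, a minimal sketch of what the now-optional opts argument allows a caller to do (the function and variable names here are placeholders, not part of this diff):

// Sketch: getObjectStream can be called with no options at all, or with an
// optional byte range. The `opts = {}` default keeps the no-options call safe.
const Stream = require('node:stream')

async function copyWholeObject(persistor, bucket, key, destination) {
  const src = await persistor.getObjectStream(bucket, key)
  await Stream.promises.pipeline(src, destination)
}

async function copyFirstKiB(persistor, bucket, key, destination) {
  // start/end select a byte range when provided
  const src = await persistor.getObjectStream(bucket, key, { start: 0, end: 1023 })
  await Stream.promises.pipeline(src, destination)
}
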
@@ -85,6 +85,7 @@ class S3Persistor extends AbstractPersistor {
* @param {Object} opts
* @param {string} [opts.contentType]
* @param {string} [opts.contentEncoding]
* @param {number} [opts.contentLength]
* @param {'*'} [opts.ifNoneMatch]
* @param {SSECOptions} [opts.ssecOptions]
* @param {string} [opts.sourceMd5]

@@ -118,6 +119,9 @@ class S3Persistor extends AbstractPersistor {
if (opts.contentEncoding) {
uploadOptions.ContentEncoding = opts.contentEncoding
}
if (opts.contentLength) {
uploadOptions.ContentLength = opts.contentLength
}
if (opts.ifNoneMatch === '*') {
uploadOptions.IfNoneMatch = '*'
}

@@ -134,9 +138,15 @@ class S3Persistor extends AbstractPersistor {
clientOptions.computeChecksums = true
}

await this._getClientForBucket(bucketName, clientOptions)
.upload(uploadOptions, { partSize: this.settings.partSize })
.promise()
if (this.settings.disableMultiPartUpload) {
await this._getClientForBucket(bucketName, clientOptions)
.putObject(uploadOptions)
.promise()
} else {
await this._getClientForBucket(bucketName, clientOptions)
.upload(uploadOptions, { partSize: this.settings.partSize })
.promise()
}
} catch (err) {
throw PersistorHelper.wrapError(
err,

@@ -150,7 +160,7 @@ class S3Persistor extends AbstractPersistor {
/**
* @param {string} bucketName
* @param {string} key
* @param {Object} opts
* @param {Object} [opts]
* @param {number} [opts.start]
* @param {number} [opts.end]
* @param {boolean} [opts.autoGunzip]

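For illustration only, a minimal sketch of the conditional-write pattern that ifNoneMatch enables, modelled on how the back-fill script later in this commit calls sendStream (bucket, key and path names are placeholders):

// Sketch: write an object only if it does not already exist, and treat a
// conflict as a successful de-duplication rather than an error.
const fs = require('node:fs')
const {
  AlreadyWrittenError,
} = require('@overleaf/object-persistor/src/Errors.js')

async function writeOnce(persistor, bucket, key, path) {
  try {
    await persistor.sendStream(bucket, key, fs.createReadStream(path), {
      contentType: 'application/octet-stream',
      ifNoneMatch: '*', // forwarded to S3 as the IfNoneMatch upload option
    })
    return true // object written
  } catch (err) {
    if (err instanceof AlreadyWrittenError) {
      return false // object was already present; nothing uploaded
    }
    throw err
  }
}
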
25 package-lock.json generated
@@ -5759,6 +5759,17 @@
"node": ">=10"
}
},
"node_modules/@google-cloud/secret-manager": {
"version": "5.6.0",
"resolved": "https://registry.npmjs.org/@google-cloud/secret-manager/-/secret-manager-5.6.0.tgz",
"integrity": "sha512-0daW/OXQEVc6VQKPyJTQNyD+563I/TYQ7GCQJx4dq3lB666R9FUPvqHx9b/o/qQtZ5pfuoCbGZl3krpxgTSW8Q==",
"dependencies": {
"google-gax": "^4.0.3"
},
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/@google-cloud/storage": {
"version": "6.10.1",
"resolved": "https://registry.npmjs.org/@google-cloud/storage/-/storage-6.10.1.tgz",

@@ -39688,10 +39699,13 @@
"version": "1.0.0",
"license": "Proprietary",
"dependencies": {
"@google-cloud/secret-manager": "^5.6.0",
"@overleaf/logger": "*",
"@overleaf/metrics": "*",
"@overleaf/mongo-utils": "*",
"@overleaf/o-error": "*",
"@overleaf/object-persistor": "*",
"@overleaf/promise-utils": "*",
"@overleaf/redis-wrapper": "*",
"@overleaf/settings": "*",
"@overleaf/stream-utils": "^0.1.0",

@@ -47889,6 +47903,14 @@
"resolved": "https://registry.npmjs.org/@google-cloud/promisify/-/promisify-2.0.4.tgz",
"integrity": "sha512-j8yRSSqswWi1QqUGKVEKOG03Q7qOoZP6/h2zN2YO+F5h2+DHU0bSrHCK9Y7lo2DI9fBd8qGAw795sf+3Jva4yA=="
},
"@google-cloud/secret-manager": {
"version": "5.6.0",
"resolved": "https://registry.npmjs.org/@google-cloud/secret-manager/-/secret-manager-5.6.0.tgz",
"integrity": "sha512-0daW/OXQEVc6VQKPyJTQNyD+563I/TYQ7GCQJx4dq3lB666R9FUPvqHx9b/o/qQtZ5pfuoCbGZl3krpxgTSW8Q==",
"requires": {
"google-gax": "^4.0.3"
}
},
"@google-cloud/storage": {
"version": "6.10.1",
"resolved": "https://registry.npmjs.org/@google-cloud/storage/-/storage-6.10.1.tgz",

@@ -69319,10 +69341,13 @@
"overleaf-editor": {
"version": "file:services/history-v1",
"requires": {
"@google-cloud/secret-manager": "^5.6.0",
"@overleaf/logger": "*",
"@overleaf/metrics": "*",
"@overleaf/mongo-utils": "*",
"@overleaf/o-error": "*",
"@overleaf/object-persistor": "*",
"@overleaf/promise-utils": "*",
"@overleaf/redis-wrapper": "*",
"@overleaf/settings": "*",
"@overleaf/stream-utils": "^0.1.0",

@@ -1,5 +1,5 @@
history-v1
--dependencies=postgres,gcs,mongo
--dependencies=postgres,gcs,mongo,s3
--docker-repos=us-east1-docker.pkg.dev/overleaf-ops/ol-docker
--env-add=
--env-pass-through=

@@ -8,6 +8,8 @@
"s3": {
"key": "AWS_ACCESS_KEY_ID",
"secret": "AWS_SECRET_ACCESS_KEY",
"endpoint": "AWS_S3_ENDPOINT",
"pathStyle": "AWS_S3_PATH_STYLE",
"maxRetries": "S3_MAX_RETRIES",
"httpOptions": {
"timeout": "S3_TIMEOUT"

@@ -30,6 +32,19 @@
"buckets": "PERSISTOR_BUCKET_MAPPING"
}
},
"backupPersistor": {
"keyEncryptionKeys": "BACKUP_KEY_ENCRYPTION_KEYS",
"s3SSEC": {
"key": "AWS_ACCESS_KEY_ID",
"secret": "AWS_SECRET_ACCESS_KEY",
"endpoint": "AWS_S3_ENDPOINT",
"pathStyle": "AWS_S3_PATH_STYLE",
"maxRetries": "BACKUP_S3_MAX_RETRIES",
"httpOptions": {
"timeout": "BACKUP_S3_TIMEOUT"
}
}
},
"blobStore": {
"globalBucket": "OVERLEAF_EDITOR_BLOBS_BUCKET",
"projectBucket": "OVERLEAF_EDITOR_PROJECT_BLOBS_BUCKET"

@@ -42,6 +57,12 @@
"bucket": "OVERLEAF_EDITOR_ZIPS_BUCKET",
"zipTimeoutMs": "ZIP_STORE_ZIP_TIMEOUT_MS"
},
"backupStore": {
"chunksBucket":"BACKUP_OVERLEAF_EDITOR_CHUNKS_BUCKET",
"deksBucket":"BACKUP_OVERLEAF_EDITOR_DEKS_BUCKET",
"globalBlobsBucket":"BACKUP_OVERLEAF_EDITOR_GLOBAL_BLOBS_BUCKET",
"projectBlobsBucket":"BACKUP_OVERLEAF_EDITOR_PROJECT_BLOBS_BUCKET"
},
"mongo": {
"uri": "MONGO_CONNECTION_STRING"
},

@@ -13,6 +13,16 @@
"deleteConcurrency": "50"
}
},
"backupPersistor": {
"backend": "s3SSEC",
"s3SSEC": {
"maxRetries": "1",
"pathStyle": false,
"httpOptions": {
"timeout": "8000"
}
}
},
"chunkStore": {
"historyStoreConcurrency": "4"
},

@@ -23,6 +23,18 @@
"zipStore": {
"bucket": "overleaf-development-zips"
},
"backupStore": {
"chunksBucket":"overleaf-development-history-chunks",
"deksBucket":"overleaf-development-history-deks",
"globalBlobsBucket":"overleaf-development-history-global-blobs",
"projectBlobsBucket":"overleaf-development-history-project-blobs"
},
"backupPersistor": {
"keyEncryptionKeys": "[{\"key\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\",\"salt\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\"}]",
"s3SSEC": {
"ca": "[\"/certs/public.crt\"]"
}
},
"useDeleteObjects": "false",
"mongo": {
"uri": "mongodb://mongo:27017/sharelatex"

@@ -1 +1,5 @@
{ }
{
"backupPersistor": {
"tieringStorageClass": "INTELLIGENT_TIERING"
}
}

@@ -20,6 +20,19 @@
"zipStore": {
"bucket": "overleaf-test-zips"
},
"backupStore": {
"chunksBucket":"overleaf-test-history-chunks",
"deksBucket":"overleaf-test-history-deks",
"globalBlobsBucket":"overleaf-test-history-global-blobs",
"projectBlobsBucket":"overleaf-test-history-project-blobs"
},
"backupPersistor": {
"keyEncryptionKeys": "[{\"key\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\",\"salt\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\"}]",
"s3SSEC": {
"ca": "[\"/certs/public.crt\"]"
},
"tieringStorageClass": "REDUCED_REDUNDANCY"
},
"maxDeleteKeys": "3",
"useDeleteObjects": "false",
"mongo": {

@ -21,17 +21,31 @@ services:
|
|||
ELASTIC_SEARCH_DSN: es:9200
|
||||
MONGO_HOST: mongo
|
||||
POSTGRES_HOST: postgres
|
||||
AWS_S3_ENDPOINT: https://minio:9000
|
||||
AWS_S3_PATH_STYLE: 'true'
|
||||
AWS_ACCESS_KEY_ID: OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
|
||||
AWS_SECRET_ACCESS_KEY: OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
|
||||
MINIO_ROOT_USER: MINIO_ROOT_USER
|
||||
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
|
||||
GCS_API_ENDPOINT: http://gcs:9090
|
||||
GCS_PROJECT_ID: fake
|
||||
STORAGE_EMULATOR_HOST: http://gcs:9090/storage/v1
|
||||
MOCHA_GREP: ${MOCHA_GREP}
|
||||
NODE_ENV: test
|
||||
NODE_OPTIONS: "--unhandled-rejections=strict"
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/certs
|
||||
depends_on:
|
||||
mongo:
|
||||
condition: service_healthy
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
minio:
|
||||
condition: service_started
|
||||
minio_setup:
|
||||
condition: service_completed_successfully
|
||||
gcs:
|
||||
condition: service_healthy
|
||||
user: node
|
||||
|
@ -63,6 +77,137 @@ services:
|
|||
interval: 1s
|
||||
retries: 20
|
||||
|
||||
certs:
|
||||
image: node:18.20.4
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/certs
|
||||
working_dir: /certs
|
||||
entrypoint: sh
|
||||
command:
|
||||
- '-cex'
|
||||
- |
|
||||
if [ ! -f ./certgen ]; then
|
||||
wget -O ./certgen "https://github.com/minio/certgen/releases/download/v1.3.0/certgen-linux-$(dpkg --print-architecture)"
|
||||
chmod +x ./certgen
|
||||
fi
|
||||
if [ ! -f private.key ] || [ ! -f public.crt ]; then
|
||||
./certgen -host minio
|
||||
fi
|
||||
|
||||
minio:
|
||||
image: minio/minio:RELEASE.2024-10-13T13-34-11Z
|
||||
command: server /data
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/root/.minio/certs
|
||||
environment:
|
||||
MINIO_ROOT_USER: MINIO_ROOT_USER
|
||||
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
|
||||
depends_on:
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
|
||||
minio_setup:
|
||||
depends_on:
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
minio:
|
||||
condition: service_started
|
||||
image: minio/mc:RELEASE.2024-10-08T09-37-26Z
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/root/.mc/certs/CAs
|
||||
entrypoint: sh
|
||||
command:
|
||||
- '-cex'
|
||||
- |
|
||||
sleep 1
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD
|
||||
mc mb --ignore-existing s3/overleaf-test-history-chunks
|
||||
mc mb --ignore-existing s3/overleaf-test-history-deks
|
||||
mc mb --ignore-existing s3/overleaf-test-history-global-blobs
|
||||
mc mb --ignore-existing s3/overleaf-test-history-project-blobs
|
||||
mc admin user add s3 \
|
||||
OVERLEAF_HISTORY_S3_ACCESS_KEY_ID \
|
||||
OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
|
||||
echo '
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-chunks"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-chunks/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-deks"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-deks/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs/*"
|
||||
}
|
||||
]
|
||||
}' > policy-history.json
|
||||
|
||||
mc admin policy create s3 overleaf-history policy-history.json
|
||||
mc admin policy attach s3 overleaf-history \
|
||||
--user=OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
|
||||
gcs:
|
||||
image: fsouza/fake-gcs-server:1.45.2
|
||||
command: ["--port=9090", "--scheme=http"]
|
||||
|
|
|
@ -25,11 +25,18 @@ services:
|
|||
- .:/overleaf/services/history-v1
|
||||
- ../../node_modules:/overleaf/node_modules
|
||||
- ../../libraries:/overleaf/libraries
|
||||
- ./test/acceptance/certs:/certs
|
||||
working_dir: /overleaf/services/history-v1
|
||||
environment:
|
||||
ELASTIC_SEARCH_DSN: es:9200
|
||||
MONGO_HOST: mongo
|
||||
POSTGRES_HOST: postgres
|
||||
AWS_S3_ENDPOINT: https://minio:9000
|
||||
AWS_S3_PATH_STYLE: 'true'
|
||||
AWS_ACCESS_KEY_ID: OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
|
||||
AWS_SECRET_ACCESS_KEY: OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
|
||||
MINIO_ROOT_USER: MINIO_ROOT_USER
|
||||
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
|
||||
GCS_API_ENDPOINT: http://gcs:9090
|
||||
GCS_PROJECT_ID: fake
|
||||
STORAGE_EMULATOR_HOST: http://gcs:9090/storage/v1
|
||||
|
@ -43,6 +50,12 @@ services:
|
|||
condition: service_healthy
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
minio:
|
||||
condition: service_started
|
||||
minio_setup:
|
||||
condition: service_completed_successfully
|
||||
gcs:
|
||||
condition: service_healthy
|
||||
command: npm run --silent test:acceptance
|
||||
|
@ -66,6 +79,137 @@ services:
|
|||
interval: 1s
|
||||
retries: 20
|
||||
|
||||
certs:
|
||||
image: node:18.20.4
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/certs
|
||||
working_dir: /certs
|
||||
entrypoint: sh
|
||||
command:
|
||||
- '-cex'
|
||||
- |
|
||||
if [ ! -f ./certgen ]; then
|
||||
wget -O ./certgen "https://github.com/minio/certgen/releases/download/v1.3.0/certgen-linux-$(dpkg --print-architecture)"
|
||||
chmod +x ./certgen
|
||||
fi
|
||||
if [ ! -f private.key ] || [ ! -f public.crt ]; then
|
||||
./certgen -host minio
|
||||
fi
|
||||
|
||||
minio:
|
||||
image: minio/minio:RELEASE.2024-10-13T13-34-11Z
|
||||
command: server /data
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/root/.minio/certs
|
||||
environment:
|
||||
MINIO_ROOT_USER: MINIO_ROOT_USER
|
||||
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
|
||||
depends_on:
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
|
||||
minio_setup:
|
||||
depends_on:
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
minio:
|
||||
condition: service_started
|
||||
image: minio/mc:RELEASE.2024-10-08T09-37-26Z
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/root/.mc/certs/CAs
|
||||
entrypoint: sh
|
||||
command:
|
||||
- '-cex'
|
||||
- |
|
||||
sleep 1
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD
|
||||
mc mb --ignore-existing s3/overleaf-test-history-chunks
|
||||
mc mb --ignore-existing s3/overleaf-test-history-deks
|
||||
mc mb --ignore-existing s3/overleaf-test-history-global-blobs
|
||||
mc mb --ignore-existing s3/overleaf-test-history-project-blobs
|
||||
mc admin user add s3 \
|
||||
OVERLEAF_HISTORY_S3_ACCESS_KEY_ID \
|
||||
OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
|
||||
echo '
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-chunks"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-chunks/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-deks"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-deks/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs/*"
|
||||
}
|
||||
]
|
||||
}' > policy-history.json
|
||||
|
||||
mc admin policy create s3 overleaf-history policy-history.json
|
||||
mc admin policy attach s3 overleaf-history \
|
||||
--user=OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
|
||||
gcs:
|
||||
image: fsouza/fake-gcs-server:1.45.2
|
||||
command: ["--port=9090", "--scheme=http"]
|
||||
|
|
|
@@ -6,10 +6,13 @@
"license": "Proprietary",
"private": true,
"dependencies": {
"@google-cloud/secret-manager": "^5.6.0",
"@overleaf/logger": "*",
"@overleaf/metrics": "*",
"@overleaf/mongo-utils": "*",
"@overleaf/o-error": "*",
"@overleaf/object-persistor": "*",
"@overleaf/promise-utils": "*",
"@overleaf/redis-wrapper": "*",
"@overleaf/settings": "*",
"@overleaf/stream-utils": "^0.1.0",

107 services/history-v1/storage/lib/backupPersistor.mjs Normal file
@@ -0,0 +1,107 @@
// @ts-check
import fs from 'node:fs'
import Path from 'node:path'
import _ from 'lodash'
import config from 'config'
import { SecretManagerServiceClient } from '@google-cloud/secret-manager'
import {
PerProjectEncryptedS3Persistor,
RootKeyEncryptionKey,
} from '@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor.js'

const persistorConfig = _.cloneDeep(config.get('backupPersistor'))
const { chunksBucket, deksBucket, globalBlobsBucket, projectBlobsBucket } =
config.get('backupStore')

export { chunksBucket, globalBlobsBucket, projectBlobsBucket }

function convertKey(key, convertFn) {
if (_.has(persistorConfig, key)) {
_.update(persistorConfig, key, convertFn)
}
}

convertKey('s3SSEC.httpOptions.timeout', s => parseInt(s, 10))
convertKey('s3SSEC.maxRetries', s => parseInt(s, 10))
convertKey('s3SSEC.pathStyle', s => s === 'true')
// array of CA, either inlined or on disk
convertKey('s3SSEC.ca', s =>
JSON.parse(s).map(ca => (ca.startsWith('/') ? fs.readFileSync(ca) : ca))
)

/** @type {() => Promise<string>} */
let getRawRootKeyEncryptionKeys

if ((process.env.NODE_ENV || 'production') === 'production') {
;[persistorConfig.s3SSEC.key, persistorConfig.s3SSEC.secret] = (
await loadFromSecretsManager(
process.env.BACKUP_AWS_CREDENTIALS || '',
'BACKUP_AWS_CREDENTIALS'
)
).split(':')
getRawRootKeyEncryptionKeys = () =>
loadFromSecretsManager(
persistorConfig.keyEncryptionKeys,
'BACKUP_KEY_ENCRYPTION_KEYS'
)
} else {
getRawRootKeyEncryptionKeys = () => persistorConfig.keyEncryptionKeys
}

export const DELETION_ONLY = persistorConfig.keyEncryptionKeys === 'none'
if (DELETION_ONLY) {
// For Backup-deleter; should not encrypt or read data; deleting does not need key.
getRawRootKeyEncryptionKeys = () => new Promise(_resolve => {})
}

/**
* @param {string} bucketName
* @param {string} path
* @return {string}
*/
function pathToProjectFolder(bucketName, path) {
switch (bucketName) {
case deksBucket:
case chunksBucket:
case projectBlobsBucket:
return Path.join(...path.split('/').slice(0, 3)) + '/'
default:
throw new Error(`${bucketName} does not store per-project files`)
}
}

/**
* @param {string} name
* @param {string} label
* @return {Promise<string>}
*/
async function loadFromSecretsManager(name, label) {
const client = new SecretManagerServiceClient()
const [version] = await client.accessSecretVersion({ name })
if (!version.payload?.data) throw new Error(`empty secret: ${label}`)
return version.payload.data.toString()
}

async function getRootKeyEncryptionKeys() {
return JSON.parse(await getRawRootKeyEncryptionKeys()).map(
({ key, salt }) => {
return new RootKeyEncryptionKey(
Buffer.from(key, 'base64'),
Buffer.from(salt, 'base64')
)
}
)
}

export const backupPersistor = new PerProjectEncryptedS3Persistor({
...persistorConfig.s3SSEC,
disableMultiPartUpload: true,
dataEncryptionKeyBucketName: deksBucket,
pathToProjectFolder,
getRootKeyEncryptionKeys,
storageClass: {
[deksBucket]: 'STANDARD',
[chunksBucket]: persistorConfig.tieringStorageClass,
[projectBlobsBucket]: persistorConfig.tieringStorageClass,
},
})

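For illustration only, a minimal sketch of how this module is consumed, modelled on the back_fill_file_hash.mjs script added in this commit (historyId, hash and the local file path are placeholder values):

// Sketch: fetch the project-scoped persistor once (this resolves and caches
// the project's data encryption key) and upload an encrypted blob copy.
import fs from 'node:fs'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { makeProjectKey } from '../lib/blob_store/index.js'

async function backupBlob(historyId, hash, localPath) {
  const key = makeProjectKey(historyId, hash)
  const persistor = await backupPersistor.forProject(projectBlobsBucket, key)
  await persistor.sendStream(
    projectBlobsBucket,
    key,
    fs.createReadStream(localPath),
    { contentType: 'application/octet-stream', ifNoneMatch: '*' }
  )
}
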
@@ -337,4 +337,4 @@ class BlobStore {
}
}

module.exports = { BlobStore, loadGlobalBlobs }
module.exports = { BlobStore, loadGlobalBlobs, makeProjectKey, GLOBAL_BLOBS }

@@ -10,7 +10,19 @@ const chunks = db.collection('projectHistoryChunks')
const blobs = db.collection('projectHistoryBlobs')
const globalBlobs = db.collection('projectHistoryGlobalBlobs')
const shardedBlobs = db.collection('projectHistoryShardedBlobs')
// Temporary collection for tracking progress of backed up old blobs (without a hash).
// The initial sync process will be able to skip over these.
// Schema: _id: projectId, blobs: [Binary]
const backedUpBlobs = db.collection('projectHistoryBackedUpBlobs')

Metrics.mongodb.monitor(client)

module.exports = { client, db, chunks, blobs, globalBlobs, shardedBlobs }
module.exports = {
client,
db,
chunks,
blobs,
globalBlobs,
shardedBlobs,
backedUpBlobs,
}

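For illustration only, a minimal sketch of how the new backedUpBlobs collection records progress, mirroring the $addToSet update performed by the back-fill script (projectId and hash are placeholders):

// Sketch: hashes are stored as BSON Binary values; $addToSet plus upsert
// keeps the per-project array duplicate-free, so retries stay idempotent.
const { Binary } = require('mongodb')
const { backedUpBlobs } = require('./mongodb.js') // path is illustrative

async function markBlobBackedUp(projectId, hash) {
  await backedUpBlobs.updateOne(
    { _id: projectId },
    { $addToSet: { blobs: new Binary(Buffer.from(hash, 'hex')) } },
    { upsert: true }
  )
}
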
824 services/history-v1/storage/scripts/back_fill_file_hash.mjs Normal file
@ -0,0 +1,824 @@
|
|||
// @ts-check
|
||||
import Crypto from 'node:crypto'
|
||||
import Events from 'node:events'
|
||||
import fs from 'node:fs'
|
||||
import Path from 'node:path'
|
||||
import Stream from 'node:stream'
|
||||
import zLib from 'node:zlib'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
import { Binary, ObjectId } from 'mongodb'
|
||||
import logger from '@overleaf/logger'
|
||||
import {
|
||||
batchedUpdate,
|
||||
READ_PREFERENCE_SECONDARY,
|
||||
} from '@overleaf/mongo-utils/batchedUpdate.js'
|
||||
import OError from '@overleaf/o-error'
|
||||
import {
|
||||
AlreadyWrittenError,
|
||||
NoKEKMatchedError,
|
||||
NotFoundError,
|
||||
} from '@overleaf/object-persistor/src/Errors.js'
|
||||
import { promiseMapWithLimit } from '@overleaf/promise-utils'
|
||||
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
|
||||
import {
|
||||
BlobStore,
|
||||
GLOBAL_BLOBS,
|
||||
loadGlobalBlobs,
|
||||
makeProjectKey,
|
||||
} from '../lib/blob_store/index.js'
|
||||
import { backedUpBlobs, db } from '../lib/mongodb.js'
|
||||
import filestorePersistor from '../lib/persistor.js'
|
||||
|
||||
// Silence warning.
|
||||
Events.setMaxListeners(20)
|
||||
|
||||
// Enable caching for ObjectId.toString()
|
||||
ObjectId.cacheHexString = true
|
||||
|
||||
/**
|
||||
* @typedef {import("overleaf-editor-core").Blob} Blob
|
||||
* @typedef {import("mongodb").Collection} Collection
|
||||
* @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} FileRef
|
||||
* @property {ObjectId} _id
|
||||
* @property {string} hash
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} Folder
|
||||
* @property {Array<Folder>} folders
|
||||
* @property {Array<FileRef>} fileRefs
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} DeletedFileRef
|
||||
* @property {ObjectId} _id
|
||||
* @property {ObjectId} projectId
|
||||
* @property {string} hash
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} Project
|
||||
* @property {ObjectId} _id
|
||||
* @property {Array<Folder>} rootFolder
|
||||
* @property {Array<string>} deletedFileIds
|
||||
* @property {{history: {id: string}}} overleaf
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} QueueEntry
|
||||
* @property {ProjectContext} ctx
|
||||
* @property {string} fileId
|
||||
* @property {string} path
|
||||
* @property {string} [hash]
|
||||
*/
|
||||
|
||||
// Time of closing the ticket for adding hashes: https://github.com/overleaf/internal/issues/464#issuecomment-492668129
|
||||
const ALL_PROJECTS_HAVE_FILE_HASHES_AFTER = new Date('2019-05-15T14:02:00Z')
|
||||
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
|
||||
const BATCH_RANGE_START =
|
||||
process.env.BATCH_RANGE_START ||
|
||||
ObjectId.createFromTime(PUBLIC_LAUNCH_DATE.getTime() / 1000).toString()
|
||||
const BATCH_RANGE_END =
|
||||
process.env.BATCH_RANGE_END ||
|
||||
ObjectId.createFromTime(
|
||||
ALL_PROJECTS_HAVE_FILE_HASHES_AFTER.getTime() / 1000
|
||||
).toString()
|
||||
// We need to control the start and end as ids of deleted projects are created at time of deletion.
|
||||
delete process.env.BATCH_RANGE_START
|
||||
delete process.env.BATCH_RANGE_END
|
||||
|
||||
// Concurrency for downloading from GCS and updating hashes in mongo
|
||||
const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10)
|
||||
// Retries for processing a given file
|
||||
const RETRIES = parseInt(process.env.RETRIES || '10', 10)
|
||||
const RETRY_DELAY_MS = parseInt(process.env.RETRY_DELAY_MS || '100', 10)
|
||||
|
||||
const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || ''
|
||||
if (!USER_FILES_BUCKET_NAME) {
|
||||
throw new Error('env var USER_FILES_BUCKET_NAME is missing')
|
||||
}
|
||||
const RETRY_FILESTORE_404 = process.env.RETRY_FILESTORE_404 === 'true'
|
||||
const BUFFER_DIR = fs.mkdtempSync(
|
||||
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
|
||||
)
|
||||
|
||||
const projectsCollection = db.collection('projects')
|
||||
const deletedProjectsCollection = db.collection('deletedProjects')
|
||||
const deletedFilesCollection = db.collection('deletedFiles')
|
||||
|
||||
const STATS = {
|
||||
projects: 0,
|
||||
filesWithoutHash: 0,
|
||||
filesDuplicated: 0,
|
||||
filesRetries: 0,
|
||||
filesFailed: 0,
|
||||
fileTreeUpdated: 0,
|
||||
globalBlobsCount: 0,
|
||||
globalBlobsEgress: 0,
|
||||
projectDeleted: 0,
|
||||
projectHardDeleted: 0,
|
||||
fileHardDeleted: 0,
|
||||
mongoUpdates: 0,
|
||||
deduplicatedWriteToAWSLocalCount: 0,
|
||||
deduplicatedWriteToAWSLocalEgress: 0,
|
||||
deduplicatedWriteToAWSRemoteCount: 0,
|
||||
deduplicatedWriteToAWSRemoteEgress: 0,
|
||||
writeToAWSCount: 0,
|
||||
writeToAWSEgress: 0,
|
||||
}
|
||||
|
||||
function printStats() {
|
||||
console.log(
|
||||
JSON.stringify({
|
||||
time: new Date(),
|
||||
...STATS,
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
setInterval(printStats, 60_000)
|
||||
|
||||
/**
|
||||
* @param {QueueEntry} entry
|
||||
* @return {Promise<string>}
|
||||
*/
|
||||
async function processFile(entry) {
|
||||
for (let attempt = 0; attempt < RETRIES; attempt++) {
|
||||
try {
|
||||
return await processFileOnce(entry)
|
||||
} catch (err) {
|
||||
if (err instanceof NotFoundError) {
|
||||
const { bucketName } = OError.getFullInfo(err)
|
||||
if (bucketName === USER_FILES_BUCKET_NAME && !RETRY_FILESTORE_404) {
|
||||
throw err // disable retries for not found in filestore bucket case
|
||||
}
|
||||
}
|
||||
if (err instanceof NoKEKMatchedError) {
|
||||
throw err // disable retries when upload to S3 will fail again
|
||||
}
|
||||
STATS.filesRetries++
|
||||
const {
|
||||
ctx: { projectId },
|
||||
fileId,
|
||||
path,
|
||||
} = entry
|
||||
logger.warn(
|
||||
{ err, projectId, fileId, path, attempt },
|
||||
'failed to process file, trying again'
|
||||
)
|
||||
await setTimeout(RETRY_DELAY_MS)
|
||||
}
|
||||
}
|
||||
return await processFileOnce(entry)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {QueueEntry} entry
|
||||
* @return {Promise<string>}
|
||||
*/
|
||||
async function processFileOnce(entry) {
|
||||
const { fileId } = entry
|
||||
const { projectId, historyId } = entry.ctx
|
||||
const filePath = Path.join(
|
||||
BUFFER_DIR,
|
||||
projectId.toString() + fileId.toString()
|
||||
)
|
||||
const dst = fs.createWriteStream(filePath)
|
||||
const src = await filestorePersistor.getObjectStream(
|
||||
USER_FILES_BUCKET_NAME,
|
||||
`${projectId}/${fileId}`
|
||||
)
|
||||
await Stream.promises.pipeline(src, dst)
|
||||
|
||||
const blobStore = new BlobStore(historyId)
|
||||
const blob = await blobStore.putFile(filePath)
|
||||
const hash = blob.getHash()
|
||||
|
||||
if (GLOBAL_BLOBS.has(hash)) {
|
||||
STATS.globalBlobsCount++
|
||||
STATS.globalBlobsEgress += estimateBlobSize(blob)
|
||||
return hash
|
||||
}
|
||||
|
||||
if (entry.ctx.hasBackedUpBlob(hash)) {
|
||||
STATS.deduplicatedWriteToAWSLocalCount++
|
||||
STATS.deduplicatedWriteToAWSLocalEgress += estimateBlobSize(blob)
|
||||
return hash
|
||||
}
|
||||
entry.ctx.recordPendingBlob(hash)
|
||||
|
||||
let backupSource
|
||||
let contentEncoding
|
||||
const md5 = Crypto.createHash('md5')
|
||||
let size
|
||||
if (blob.getStringLength()) {
|
||||
const filePathCompressed = filePath + '.gz'
|
||||
backupSource = filePathCompressed
|
||||
contentEncoding = 'gzip'
|
||||
size = 0
|
||||
await Stream.promises.pipeline(
|
||||
fs.createReadStream(filePath),
|
||||
zLib.createGzip(),
|
||||
async function* (source) {
|
||||
for await (const chunk of source) {
|
||||
size += chunk.byteLength
|
||||
md5.update(chunk)
|
||||
yield chunk
|
||||
}
|
||||
},
|
||||
fs.createWriteStream(filePathCompressed)
|
||||
)
|
||||
} else {
|
||||
backupSource = filePath
|
||||
size = blob.getByteLength()
|
||||
await Stream.promises.pipeline(fs.createReadStream(filePath), md5)
|
||||
}
|
||||
const backendKeyPath = makeProjectKey(historyId, blob.getHash())
|
||||
const persistor = await entry.ctx.getCachedPersistor(backendKeyPath)
|
||||
try {
|
||||
STATS.writeToAWSCount++
|
||||
await persistor.sendStream(
|
||||
projectBlobsBucket,
|
||||
backendKeyPath,
|
||||
fs.createReadStream(backupSource),
|
||||
{
|
||||
contentEncoding,
|
||||
contentType: 'application/octet-stream',
|
||||
contentLength: size,
|
||||
sourceMd5: md5.digest('hex'),
|
||||
ifNoneMatch: '*', // de-duplicate write (we pay for the request, but avoid egress)
|
||||
}
|
||||
)
|
||||
STATS.writeToAWSEgress += size
|
||||
} catch (err) {
|
||||
if (err instanceof AlreadyWrittenError) {
|
||||
STATS.deduplicatedWriteToAWSRemoteCount++
|
||||
STATS.deduplicatedWriteToAWSRemoteEgress += size
|
||||
} else {
|
||||
STATS.writeToAWSEgress += size
|
||||
entry.ctx.recordFailedBlob(hash)
|
||||
throw err
|
||||
}
|
||||
}
|
||||
entry.ctx.recordBackedUpBlob(hash)
|
||||
return hash
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Array<QueueEntry>} files
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function processFiles(files) {
|
||||
if (files.length === 0) return // all processed
|
||||
await fs.promises.mkdir(BUFFER_DIR, { recursive: true })
|
||||
try {
|
||||
await promiseMapWithLimit(
|
||||
CONCURRENCY,
|
||||
files,
|
||||
/**
|
||||
* @param {QueueEntry} entry
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function (entry) {
|
||||
try {
|
||||
await entry.ctx.processFile(entry)
|
||||
} catch (err) {
|
||||
STATS.filesFailed++
|
||||
const {
|
||||
ctx: { projectId },
|
||||
fileId,
|
||||
path,
|
||||
} = entry
|
||||
logger.error(
|
||||
{ err, projectId, fileId, path },
|
||||
'failed to process file'
|
||||
)
|
||||
}
|
||||
}
|
||||
)
|
||||
} finally {
|
||||
await fs.promises.rm(BUFFER_DIR, { recursive: true, force: true })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Array<Project>} batch
|
||||
* @param {string} prefix
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function handleLiveTreeBatch(batch, prefix = 'rootFolder.0') {
|
||||
if (process.argv.includes('deletedFiles')) {
|
||||
await collectDeletedFiles(batch)
|
||||
}
|
||||
const files = Array.from(findFileInBatch(batch, prefix))
|
||||
STATS.projects += batch.length
|
||||
STATS.filesWithoutHash += files.length
|
||||
batch.length = 0 // GC
|
||||
// The files are currently ordered by project-id.
|
||||
// Order them by file-id to
|
||||
// - avoid head-of-line blocking from many project-files waiting on the generation of the projects DEK (round trip to AWS)
|
||||
// - bonus: increase chance of de-duplicating write to AWS
|
||||
files.sort((a, b) => (a.fileId > b.fileId ? 1 : -1))
|
||||
await processFiles(files)
|
||||
await promiseMapWithLimit(
|
||||
CONCURRENCY,
|
||||
files,
|
||||
/**
|
||||
* @param {QueueEntry} entry
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function (entry) {
|
||||
await entry.ctx.flushMongoQueues()
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Array<{project: Project}>} batch
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function handleDeletedFileTreeBatch(batch) {
|
||||
await handleLiveTreeBatch(
|
||||
batch.map(d => d.project),
|
||||
'project.rootFolder.0'
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {QueueEntry} entry
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function tryUpdateFileRefInMongo(entry) {
|
||||
if (entry.path === '') {
|
||||
return await tryUpdateDeletedFileRefInMongo(entry)
|
||||
} else if (entry.path.startsWith('project.')) {
|
||||
return await tryUpdateFileRefInMongoInDeletedProject(entry)
|
||||
}
|
||||
|
||||
STATS.mongoUpdates++
|
||||
const result = await projectsCollection.updateOne(
|
||||
{
|
||||
_id: entry.ctx.projectId,
|
||||
[`${entry.path}._id`]: new ObjectId(entry.fileId),
|
||||
},
|
||||
{
|
||||
$set: { [`${entry.path}.hash`]: entry.hash },
|
||||
}
|
||||
)
|
||||
return result.matchedCount === 1
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {QueueEntry} entry
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function tryUpdateDeletedFileRefInMongo(entry) {
|
||||
STATS.mongoUpdates++
|
||||
const result = await deletedFilesCollection.updateOne(
|
||||
{
|
||||
_id: new ObjectId(entry.fileId),
|
||||
projectId: entry.ctx.projectId,
|
||||
},
|
||||
{ $set: { hash: entry.hash } }
|
||||
)
|
||||
return result.matchedCount === 1
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {QueueEntry} entry
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function tryUpdateFileRefInMongoInDeletedProject(entry) {
|
||||
STATS.mongoUpdates++
|
||||
const result = await deletedProjectsCollection.updateOne(
|
||||
{
|
||||
'deleterData.deletedProjectId': entry.ctx.projectId,
|
||||
[`${entry.path}._id`]: new ObjectId(entry.fileId),
|
||||
},
|
||||
{
|
||||
$set: { [`${entry.path}.hash`]: entry.hash },
|
||||
}
|
||||
)
|
||||
return result.matchedCount === 1
|
||||
}
|
||||
|
||||
const RETRY_UPDATE_HASH = 100
|
||||
|
||||
/**
|
||||
* @param {QueueEntry} entry
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function updateFileRefInMongo(entry) {
|
||||
if (await tryUpdateFileRefInMongo(entry)) return
|
||||
|
||||
const { fileId } = entry
|
||||
const { projectId } = entry.ctx
|
||||
for (let i = 0; i < RETRY_UPDATE_HASH; i++) {
|
||||
let prefix = 'rootFolder.0'
|
||||
let p = await projectsCollection.findOne(
|
||||
{ _id: projectId },
|
||||
{ projection: { rootFolder: 1 } }
|
||||
)
|
||||
if (!p) {
|
||||
STATS.projectDeleted++
|
||||
prefix = 'project.rootFolder.0'
|
||||
const deletedProject = await deletedProjectsCollection.findOne(
|
||||
{
|
||||
'deleterData.deletedProjectId': projectId,
|
||||
project: { $exists: true },
|
||||
},
|
||||
{ projection: { 'project.rootFolder': 1 } }
|
||||
)
|
||||
p = deletedProject?.project
|
||||
if (!p) {
|
||||
STATS.projectHardDeleted++
|
||||
console.warn(
|
||||
'bug: project hard-deleted while processing',
|
||||
projectId,
|
||||
fileId
|
||||
)
|
||||
return
|
||||
}
|
||||
}
|
||||
let found = false
|
||||
for (const e of findFiles(entry.ctx, p.rootFolder[0], prefix)) {
|
||||
found = e.fileId === fileId
|
||||
if (!found) continue
|
||||
if (await tryUpdateFileRefInMongo(e)) return
|
||||
break
|
||||
}
|
||||
if (!found) {
|
||||
if (await tryUpdateDeletedFileRefInMongo(entry)) return
|
||||
STATS.fileHardDeleted++
|
||||
console.warn('bug: file hard-deleted while processing', projectId, fileId)
|
||||
return
|
||||
}
|
||||
|
||||
STATS.fileTreeUpdated++
|
||||
}
|
||||
throw new OError(
|
||||
'file-tree updated repeatedly while trying to add hash',
|
||||
entry
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {ProjectContext} ctx
|
||||
* @param {Folder} folder
|
||||
* @param {string} path
|
||||
* @return Generator<QueueEntry>
|
||||
*/
|
||||
function* findFiles(ctx, folder, path) {
|
||||
let i = 0
|
||||
for (const child of folder.folders) {
|
||||
yield* findFiles(ctx, child, `${path}.folders.${i}`)
|
||||
i++
|
||||
}
|
||||
i = 0
|
||||
for (const fileRef of folder.fileRefs) {
|
||||
if (!fileRef.hash) {
|
||||
yield {
|
||||
ctx,
|
||||
fileId: fileRef._id.toString(),
|
||||
path: `${path}.fileRefs.${i}`,
|
||||
}
|
||||
}
|
||||
i++
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Array<Project>} projects
|
||||
* @param {string} prefix
|
||||
*/
|
||||
function* findFileInBatch(projects, prefix) {
|
||||
for (const project of projects) {
|
||||
const ctx = new ProjectContext(project)
|
||||
yield* findFiles(ctx, project.rootFolder[0], prefix)
|
||||
for (const fileId of project.deletedFileIds || []) {
|
||||
yield { ctx, fileId, path: '' }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Array<Project>} projects
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function collectDeletedFiles(projects) {
|
||||
const cursor = deletedFilesCollection.find(
|
||||
{
|
||||
projectId: { $in: projects.map(p => p._id) },
|
||||
hash: { $exists: false },
|
||||
},
|
||||
{
|
||||
projection: { _id: 1, projectId: 1 },
|
||||
readPreference: READ_PREFERENCE_SECONDARY,
|
||||
sort: { projectId: 1 },
|
||||
}
|
||||
)
|
||||
const processed = projects.slice()
|
||||
for await (const deletedFileRef of cursor) {
|
||||
const idx = processed.findIndex(
|
||||
p => p._id.toString() === deletedFileRef.projectId.toString()
|
||||
)
|
||||
if (idx === -1) {
|
||||
throw new Error(
|
||||
`bug: order of deletedFiles mongo records does not match batch of projects (${deletedFileRef.projectId} out of order)`
|
||||
)
|
||||
}
|
||||
processed.splice(0, idx)
|
||||
const project = processed[0]
|
||||
project.deletedFileIds = project.deletedFileIds || []
|
||||
project.deletedFileIds.push(deletedFileRef._id.toString())
|
||||
}
|
||||
}
|
||||
|
||||
const BATCH_HASH_WRITES = 1_000
|
||||
const BATCH_FILE_UPDATES = 100
|
||||
|
||||
class ProjectContext {
|
||||
/** @type {Promise<CachedPerProjectEncryptedS3Persistor> | null} */
|
||||
#cachedPersistorPromise = null
|
||||
|
||||
/**
|
||||
* @param {Project} project
|
||||
*/
|
||||
constructor(project) {
|
||||
this.projectId = project._id
|
||||
this.historyId = project.overleaf.history.id.toString()
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} key
|
||||
* @return {Promise<CachedPerProjectEncryptedS3Persistor>}
|
||||
*/
|
||||
getCachedPersistor(key) {
|
||||
if (!this.#cachedPersistorPromise) {
|
||||
// Fetch DEK once, but only if needed -- upon the first use
|
||||
this.#cachedPersistorPromise = this.#getCachedPersistorWithRetries(key)
|
||||
}
|
||||
return this.#cachedPersistorPromise
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} key
|
||||
* @return {Promise<CachedPerProjectEncryptedS3Persistor>}
|
||||
*/
|
||||
async #getCachedPersistorWithRetries(key) {
|
||||
for (let attempt = 0; attempt < RETRIES; attempt++) {
|
||||
try {
|
||||
return await backupPersistor.forProject(projectBlobsBucket, key)
|
||||
} catch (err) {
|
||||
if (err instanceof NoKEKMatchedError) {
|
||||
throw err
|
||||
} else {
|
||||
logger.warn(
|
||||
{ err, projectId: this.projectId, attempt },
|
||||
'failed to get DEK, trying again'
|
||||
)
|
||||
await setTimeout(RETRY_DELAY_MS)
|
||||
}
|
||||
}
|
||||
}
|
||||
return await backupPersistor.forProject(projectBlobsBucket, key)
|
||||
}
|
||||
|
||||
async flushMongoQueuesIfNeeded() {
|
||||
if (this.#completedBlobs.size > BATCH_HASH_WRITES) {
|
||||
await this.#storeBackedUpBlobs()
|
||||
}
|
||||
if (this.#pendingFileWrites.length > BATCH_FILE_UPDATES) {
|
||||
await this.#storeFileHashes()
|
||||
}
|
||||
}
|
||||
|
||||
async flushMongoQueues() {
|
||||
await this.#storeBackedUpBlobs()
|
||||
await this.#storeFileHashes()
|
||||
}
|
||||
|
||||
/** @type {Set<string>} */
|
||||
#pendingBlobs = new Set()
|
||||
/** @type {Set<string>} */
|
||||
#completedBlobs = new Set()
|
||||
|
||||
async #storeBackedUpBlobs() {
|
||||
if (this.#completedBlobs.size === 0) return
|
||||
const blobs = Array.from(this.#completedBlobs).map(
|
||||
hash => new Binary(Buffer.from(hash, 'hex'))
|
||||
)
|
||||
this.#completedBlobs.clear()
|
||||
STATS.mongoUpdates++
|
||||
await backedUpBlobs.updateOne(
|
||||
{ _id: this.projectId },
|
||||
{ $addToSet: { blobs: { $each: blobs } } },
|
||||
{ upsert: true }
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} hash
|
||||
*/
|
||||
recordPendingBlob(hash) {
|
||||
this.#pendingBlobs.add(hash)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} hash
|
||||
*/
|
||||
recordFailedBlob(hash) {
|
||||
this.#pendingBlobs.delete(hash)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} hash
|
||||
*/
|
||||
recordBackedUpBlob(hash) {
|
||||
this.#completedBlobs.add(hash)
|
||||
this.#pendingBlobs.delete(hash)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} hash
|
||||
* @return {boolean}
|
||||
*/
|
||||
hasBackedUpBlob(hash) {
|
||||
return this.#pendingBlobs.has(hash) || this.#completedBlobs.has(hash)
|
||||
}
|
||||
|
||||
/** @type {Array<QueueEntry>} */
|
||||
#pendingFileWrites = []
|
||||
|
||||
/**
|
||||
* @param {QueueEntry} entry
|
||||
*/
|
||||
queueFileForWritingHash(entry) {
|
||||
this.#pendingFileWrites.push(entry)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Collection} collection
|
||||
* @param {Array<QueueEntry>} entries
|
||||
* @param {Object} query
|
||||
* @return {Promise<Array<QueueEntry>>}
|
||||
*/
|
||||
async #tryBatchHashWrites(collection, entries, query) {
|
||||
if (entries.length === 0) return []
|
||||
const update = {}
|
||||
for (const entry of entries) {
|
||||
query[`${entry.path}._id`] = new ObjectId(entry.fileId)
|
||||
update[`${entry.path}.hash`] = entry.hash
|
||||
}
|
||||
STATS.mongoUpdates++
|
||||
const result = await collection.updateOne(query, { $set: update })
|
||||
if (result.matchedCount === 1) {
|
||||
return [] // all updated
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
async #storeFileHashes() {
|
||||
if (this.#pendingFileWrites.length === 0) return
|
||||
const individualUpdates = []
|
||||
const projectEntries = []
|
||||
const deletedProjectEntries = []
|
||||
for (const entry of this.#pendingFileWrites) {
|
||||
if (entry.path === '') {
|
||||
individualUpdates.push(entry)
|
||||
} else if (entry.path.startsWith('project.')) {
|
||||
deletedProjectEntries.push(entry)
|
||||
} else {
|
||||
projectEntries.push(entry)
|
||||
}
|
||||
}
|
||||
this.#pendingFileWrites.length = 0
|
||||
|
||||
// Try to process them together, otherwise fallback to individual updates and retries.
|
||||
individualUpdates.push(
|
||||
...(await this.#tryBatchHashWrites(projectsCollection, projectEntries, {
|
||||
_id: this.projectId,
|
||||
}))
|
||||
)
|
||||
individualUpdates.push(
|
||||
...(await this.#tryBatchHashWrites(
|
||||
deletedProjectsCollection,
|
||||
deletedProjectEntries,
|
||||
{ 'deleterData.deletedProjectId': this.projectId }
|
||||
))
|
||||
)
|
||||
for (const entry of individualUpdates) {
|
||||
await updateFileRefInMongo(entry)
|
||||
}
|
||||
}
|
||||
|
||||
/** @type {Map<string, Promise<string>>} */
|
||||
#pendingFiles = new Map()
|
||||
|
||||
/**
|
||||
* @param {QueueEntry} entry
|
||||
*/
|
||||
async processFile(entry) {
|
||||
if (this.#pendingFiles.has(entry.fileId)) {
|
||||
STATS.filesDuplicated++
|
||||
} else {
|
||||
this.#pendingFiles.set(entry.fileId, processFile(entry))
|
||||
}
|
||||
entry.hash = await this.#pendingFiles.get(entry.fileId)
|
||||
this.queueFileForWritingHash(entry)
|
||||
await this.flushMongoQueuesIfNeeded()
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Blob} blob
|
||||
* @return {number}
|
||||
*/
|
||||
function estimateBlobSize(blob) {
|
||||
let size = blob.getByteLength()
|
||||
if (blob.getStringLength()) {
|
||||
// approximation for gzip (25 bytes gzip overhead and 20% compression ratio)
|
||||
size = 25 + Math.ceil(size * 0.2)
|
||||
}
|
||||
return size
|
||||
}
|
||||
|
||||
async function updateLiveFileTrees() {
|
||||
await batchedUpdate(
|
||||
projectsCollection,
|
||||
{ 'overleaf.history.id': { $exists: true } },
|
||||
handleLiveTreeBatch,
|
||||
{ rootFolder: 1, _id: 1, 'overleaf.history.id': 1 },
|
||||
{},
|
||||
{
|
||||
BATCH_RANGE_START,
|
||||
BATCH_RANGE_END,
|
||||
}
|
||||
)
|
||||
console.warn('Done updating live projects')
|
||||
}
|
||||
|
||||
async function updateDeletedFileTrees() {
|
||||
await batchedUpdate(
|
||||
deletedProjectsCollection,
|
||||
{
|
||||
'deleterData.deletedProjectId': {
|
||||
$gt: new ObjectId(BATCH_RANGE_START),
|
||||
$lte: new ObjectId(BATCH_RANGE_END),
|
||||
},
|
||||
'project.overleaf.history.id': { $exists: true },
|
||||
},
|
||||
handleDeletedFileTreeBatch,
|
||||
{
|
||||
'project.rootFolder': 1,
|
||||
'project._id': 1,
|
||||
'project.overleaf.history.id': 1,
|
||||
}
|
||||
)
|
||||
console.warn('Done updating deleted projects')
|
||||
}
|
||||
|
||||
async function main() {
|
||||
await loadGlobalBlobs()
|
||||
if (process.argv.includes('live')) {
|
||||
await updateLiveFileTrees()
|
||||
}
|
||||
if (process.argv.includes('deleted')) {
|
||||
await updateDeletedFileTrees()
|
||||
}
|
||||
console.warn('Done.')
|
||||
}
|
||||
|
||||
try {
|
||||
try {
|
||||
await main()
|
||||
} finally {
|
||||
printStats()
|
||||
}
|
||||
|
||||
let code = 0
|
||||
if (STATS.filesFailed > 0) {
|
||||
console.warn('Some files could not be processed, see logs and try again')
|
||||
code++
|
||||
}
|
||||
if (STATS.fileHardDeleted > 0) {
|
||||
console.warn(
|
||||
'Some hashes could not be updated as the files were hard-deleted, this should not happen'
|
||||
)
|
||||
code++
|
||||
}
|
||||
if (STATS.projectHardDeleted > 0) {
|
||||
console.warn(
|
||||
'Some hashes could not be updated as the project was hard-deleted, this should not happen'
|
||||
)
|
||||
code++
|
||||
}
|
||||
process.exit(code)
|
||||
} catch (err) {
|
||||
console.error(err)
|
||||
process.exit(1)
|
||||
}
|
2 services/history-v1/test/acceptance/certs/.gitignore vendored Normal file
@@ -0,0 +1,2 @@
*
!.gitignore

File diff suppressed because it is too large

@@ -14,6 +14,12 @@ const MONGO_COLLECTIONS = [
'projectHistoryBlobs',
'projectHistoryShardedBlobs',
'projectHistoryChunks',

// back_fill_file_hash.test.mjs
'deletedFiles',
'deletedProjects',
'projects',
'projectHistoryBackedUpBlobs',
]

// make sure we don't delete the wrong data by accident

@@ -35,6 +35,7 @@ async function createGcsBuckets() {
config.get('blobStore.projectBucket'),
config.get('chunkStore.bucket'),
config.get('zipStore.bucket'),
'fake-user-files-gcs',
]) {
await fetch('http://gcs:9090/storage/v1/b', {
method: 'POST',