mirror of
https://github.com/overleaf/overleaf.git
synced 2024-11-21 20:47:08 -05:00
Merge pull request #2641 from overleaf/ew-delete-orphaned-docs
Process and Script to Delete Orphaned Docs GitOrigin-RevId: 0c428e36973a9131e3d379fdf4657e802576b23d
This commit is contained in:
parent
620b95fbe8
commit
6734d9e107
4 changed files with 219 additions and 1 deletions
|
@ -10,7 +10,8 @@ const db = mongojs(Settings.mongo.url, [
|
||||||
'users',
|
'users',
|
||||||
'tokens',
|
'tokens',
|
||||||
'docSnapshots',
|
'docSnapshots',
|
||||||
'projectHistoryFailures'
|
'projectHistoryFailures',
|
||||||
|
'deletedProjects'
|
||||||
])
|
])
|
||||||
module.exports = {
|
module.exports = {
|
||||||
db,
|
db,
|
||||||
|
|
3
services/web/scripts/delete-orphaned-docs/.gitignore
vendored
Normal file
3
services/web/scripts/delete-orphaned-docs/.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
*.csv
|
||||||
|
*.csv.gz
|
||||||
|
node_modules
|
75
services/web/scripts/delete-orphaned-docs/README.md
Normal file
75
services/web/scripts/delete-orphaned-docs/README.md
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
# Delete Orphaned Docs
|
||||||
|
|
||||||
|
Because of the large numbers of documents and projects it is necessary to detect
|
||||||
|
orphaned docs using bulk exports of the raw data.
|
||||||
|
|
||||||
|
## Exporting Data Files
|
||||||
|
|
||||||
|
Follow the directions in `google-ops/README.md` for exporting data from mongo
|
||||||
|
and copying the files to your local machine.
|
||||||
|
|
||||||
|
### Exporting docs
|
||||||
|
|
||||||
|
Run the following doc export command to export all doc ids and their associated
|
||||||
|
project ids in batches of 10,000,000.
|
||||||
|
```
|
||||||
|
mongoexport --uri $READ_ONLY_MONGO_CONNECTION_STRING --collection docs --fields '_id,project_id' --skip 0 --limit 10000000 --type=csv --out docs.00000000.csv
|
||||||
|
```
|
||||||
|
This will produce files like:
|
||||||
|
```
|
||||||
|
_id,project_id
|
||||||
|
ObjectId(5babb6f864c952737a9a4c32),ObjectId(5b98bba5e2f38b7c88f6a625)
|
||||||
|
ObjectId(4eecaffcbffa66588e000007),ObjectId(4eecaffcbffa66588e00000d)
|
||||||
|
```
|
||||||
|
Concatenate these into a single file: `cat docs.*csv > all-docs-doc_id-project_id.csv`
|
||||||
|
|
||||||
|
For object ids the script will accept either plain hex strings or the `ObjectId(...)`
|
||||||
|
format used by mongoexport.
|
||||||
|
|
||||||
|
### Exporting Projects
|
||||||
|
|
||||||
|
Export project ids from both the `projects` and `deletedProjects` collections:
|
||||||
|
```
|
||||||
|
mongoexport --uri $READ_ONLY_MONGO_CONNECTION_STRING --collection projects --fields '_id' --type=csv --out projects.csv
|
||||||
|
mongoexport --uri $READ_ONLY_MONGO_CONNECTION_STRING --collection deletedProjects --fields 'project._id' --type=csv --out deleted-projects.csv
|
||||||
|
```
|
||||||
|
Concatenate these: `cat projects.csv deleted-projects.csv > all-projects-project_id.csv`
|
||||||
|
|
||||||
|
## Processing Exported Data
|
||||||
|
|
||||||
|
### Create a unique sorted list of project ids from docs
|
||||||
|
```
|
||||||
|
cut -d, -f 2 all-docs-doc_id-project_id.csv | sort | uniq > all-docs-project_ids.sorted.uniq.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create a unique sorted list of project ids from projects
|
||||||
|
```
|
||||||
|
sort all-projects-project_id.csv | uniq > all-projects-project_id.sorted.uniq.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create list of project ids in docs but not in projects
|
||||||
|
```
|
||||||
|
comm --check-order -23 all-docs-project_ids.sorted.uniq.csv all-projects-project_id.sorted.uniq.csv > orphaned-doc-project_ids.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create list of doc ids with project ids not in projects
|
||||||
|
```
|
||||||
|
grep -F -f orphaned-doc-project_ids.csv all-docs-doc_id-project_id.csv > orphaned-doc-doc_id-project_id.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
## Run doc deleter
|
||||||
|
```
|
||||||
|
node delete-orphaned-docs orphaned-doc-doc_id-project_id.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### Commit Changes
|
||||||
|
|
||||||
|
By default the script will only print the list of project ids and doc ids to be
|
||||||
|
deleted. In order to actually delete docs run with the `--commit` argument.
|
||||||
|
|
||||||
|
### Selecting Input Lines to Process
|
||||||
|
|
||||||
|
The `--limit` and `--offset` arguments can be used to specify which lines to
|
||||||
|
process. There is one doc per line so a single project will often have multiple
|
||||||
|
lines, but deletion is based on project id, so if one doc for a project is
|
||||||
|
deleted, all will be deleted, even if not all of the input lines are processed.
|
|
@ -0,0 +1,139 @@
|
||||||
|
'use strict'
|
||||||
|
|
||||||
|
const fs = require('fs')
|
||||||
|
const minimist = require('minimist')
|
||||||
|
const readline = require('readline')
|
||||||
|
|
||||||
|
const { db, ObjectId } = require('../../app/src/infrastructure/mongojs')
|
||||||
|
const DocstoreManager = require('../../app/src/Features/Docstore/DocstoreManager')
|
||||||
|
.promises
|
||||||
|
|
||||||
|
// Parse the command line.
// `commit` is declared as a boolean flag so that minimist
//   (a) never consumes the following positional argument (the input file) as
//       the flag's value, e.g. `--commit orphaned.csv`, and
//   (b) reports `--no-commit` / `--commit=false` as `false`.
// Previously any defined value, including `false`, enabled real deletion.
const argv = minimist(process.argv.slice(2), { boolean: ['commit'] })
const commit = argv.commit === true

// --offset: number of leading input lines to skip.
// --limit: maximum number of input lines to consider.
// Both fall back to 0, which disables the respective restriction.
const offset = parseInt(argv.offset, 10) || 0
const limit = parseInt(argv.limit, 10) || 0

// The first positional argument is the CSV file of "docId,projectId" lines.
const inputPath = argv._[0]
if (inputPath === undefined) {
  console.error(
    'Usage: node delete-orphaned-docs [--commit] [--offset N] [--limit N] <csv-file>'
  )
  process.exit(1)
}

if (!commit) {
  console.log('DOING DRY RUN. TO SAVE CHANGES PASS --commit')
}

// String(): minimist may hand numeric-looking arguments back as numbers,
// which createReadStream does not accept as a path.
const input = fs.createReadStream(String(inputPath))

const rl = readline.createInterface({
  crlfDelay: Infinity,
  input
})

// project id -> list of doc ids, filled in while the input is read
const orphanedDocs = {}

console.log('Loading Data')

// idx counts lines skipped because of --offset,
// processed counts lines considered against --limit
let idx = 0
let processed = 0
|
||||||
|
|
||||||
|
/**
 * Turn one CSV field into a canonical object id hex string.
 *
 * Accepts either a plain 24 character hex string or the `ObjectId(...)`
 * wrapper that appears in the export files.
 *
 * @param {string|undefined} value - raw CSV field (undefined when the line
 *   has fewer fields than expected)
 * @returns {string} the id as returned by `ObjectId(...).toString()`
 * @throws {Error} when the field is missing or is not 24 hex characters
 */
function normalizeObjectId(value) {
  if (typeof value !== 'string') {
    throw new Error('missing id')
  }
  const hex = value.replace(/^ObjectId\(/, '').replace(/\)$/, '')
  // The ObjectId constructor also accepts arbitrary 12 character strings, so
  // insist on the hex form before trusting the value in a deletion script.
  if (!/^[0-9a-f]{24}$/i.test(hex)) {
    throw new Error(`not a 24 character hex string: ${hex}`)
  }
  return ObjectId(hex).toString()
}

// Collect doc ids per project id from each "docId,projectId" input line.
// The listener is synchronous: nothing here is awaited, and an async listener
// would turn any unexpected throw into an unhandled promise rejection.
rl.on('line', line => {
  // --offset: skip the leading lines
  if (offset && idx++ < offset) {
    return
  }
  // --limit: ignore everything after `limit` lines have been considered
  if (limit && processed++ >= limit) {
    return
  }

  const [rawDocId, rawProjectId] = line.split(',')

  let docId
  let projectId
  try {
    // Both conversions are inside the try so that a malformed line (e.g. one
    // without a comma, or the CSV header) is reported and skipped.
    docId = normalizeObjectId(rawDocId)
    projectId = normalizeObjectId(rawProjectId)
  } catch (err) {
    console.error(`Invalid id: ${rawDocId}, ${rawProjectId}`)
    return
  }

  if (!orphanedDocs[projectId]) {
    orphanedDocs[projectId] = []
  }

  orphanedDocs[projectId].push(docId)
})
|
||||||
|
|
||||||
|
// Runs once the whole input has been read: report what was loaded, then
// handle the collected projects one after another and exit.
rl.on('close', async () => {
  const projectIds = Object.keys(orphanedDocs)
  const projectCount = projectIds.length

  let docCount = 0
  for (const id of projectIds) {
    docCount += orphanedDocs[id].length
  }

  console.log(`Loaded Data for ${docCount} docs in ${projectCount} Projects`)

  // Sequential on purpose: each project is awaited before the next starts
  for (const projectId of projectIds) {
    const docIds = orphanedDocs[projectId]
    await deleteOrphanedDocs(projectId, docIds)
  }

  console.log('DONE')
  process.exit()
})
|
||||||
|
|
||||||
|
/**
 * Handle one project's orphaned docs.
 *
 * Skips the project when its id is found by `projectIdExists` or when that
 * check fails. Otherwise logs the docs that would be removed and, only when
 * the script runs with `commit` set, calls `DocstoreManager.destroyProject`.
 * Errors are logged, never rethrown.
 *
 * @param {string} projectId - project id the docs belong to
 * @param {string[]} docIds - doc ids read from the input for that project
 *   (used for logging only)
 * @returns {Promise<void>}
 */
async function deleteOrphanedDocs(projectId, docIds) {
  let exists
  try {
    exists = await projectIdExists(projectId)
  } catch (err) {
    console.error(`Error checking if project exists: ${projectId}`, err.stack)
    return
  }

  if (exists) {
    console.error(`Project id exists: ${projectId}`)
    return
  }

  console.log(`Delete docs ${docIds.join(', ')} for project ${projectId}`)

  // Dry run stops after logging
  if (commit) {
    try {
      await DocstoreManager.destroyProject(projectId)
    } catch (err) {
      console.error(`Error deleting project ${projectId}`, err)
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Check whether a project id is known, either via `findProject` or via
 * `findDeletedProject`. Both lookups are started before either is awaited.
 *
 * @param {string} projectId
 * @returns {Promise<boolean>} true when at least one lookup resolved to a
 *   value other than `null`
 */
async function projectIdExists(projectId) {
  const lookups = [findProject(projectId), findDeletedProject(projectId)]
  const matches = await Promise.all(lookups)

  return matches.some(match => match !== null)
}
|
||||||
|
|
||||||
|
/**
 * Promise wrapper around the callback-style `db.projects.findOne`, looking
 * the project up by `_id`.
 *
 * @param {string} projectId
 * @returns {Promise<Object|null>} whatever `findOne` passes to its callback;
 *   rejects with the callback's error
 */
async function findProject(projectId) {
  return new Promise((resolve, reject) => {
    const query = { _id: ObjectId(projectId) }
    const fields = { _id: 1 }

    db.projects.findOne(query, fields, (err, project) => {
      if (err) {
        return reject(err)
      }
      resolve(project)
    })
  })
}
|
||||||
|
|
||||||
|
/**
 * Promise wrapper around the callback-style `db.deletedProjects.findOne`,
 * matching on the nested `project._id` field.
 *
 * @param {string} projectId
 * @returns {Promise<Object|null>} whatever `findOne` passes to its callback;
 *   rejects with the callback's error
 */
async function findDeletedProject(projectId) {
  return new Promise((resolve, reject) => {
    const query = { 'project._id': ObjectId(projectId) }
    const fields = { _id: 1 }

    db.deletedProjects.findOne(query, fields, (err, project) => {
      if (err) {
        return reject(err)
      }
      resolve(project)
    })
  })
}
|
Loading…
Reference in a new issue