mirror of
https://github.com/overleaf/overleaf.git
synced 2024-12-03 09:44:03 -05:00
ee85d948e2
GitOrigin-RevId: ef2ef77e26df59d1af3df6dc664e284d3c70102d
246 lines
7.1 KiB
JavaScript
Executable file
246 lines
7.1 KiB
JavaScript
Executable file
#!/usr/bin/env node
|
|
|
|
'use strict'
|
|
|
|
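/**
 * This script fetches all history chunks from active projects (as listed in the
 * active_doc_ids table) and counts how many projects reference each blob. It
 * also records, per project, the largest number of text operations found in a
 * single change once that number reaches a threshold.
 *
 * Results are written batch by batch as CSV files in the analytics bucket. The
 * blob reference counts are meant for the blobs.estimated_reference_count
 * column; this script does not update that column itself.
 */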
|
|
|
|
const Path = require('path')
|
|
const BPromise = require('bluebird')
|
|
const commandLineArgs = require('command-line-args')
|
|
const config = require('config')
|
|
const stringToStream = require('string-to-stream')
|
|
|
|
const { History, EditFileOperation } = require('overleaf-editor-core')
|
|
const { knex, historyStore, persistor } = require('..')
|
|
|
|
// Number of doc ids fetched per batch when --batch-size is not given.
const DEFAULT_BATCH_SIZE = 100
// Default time budget for a run, in seconds (23 hours); see --timeout.
const DEFAULT_TIMEOUT = 23 * 3600
// Largest 32-bit signed integer (2147483647); default for --max-doc-id.
const MAX_POSTGRES_INTEGER = 2 ** 31 - 1
// A change is only reported once it holds at least this many text operations.
const TEXT_OPERATION_COUNT_THRESHOLD = 500
// Bucket receiving the CSV output, read from the `analytics.bucket` setting.
const BUCKET = config.get('analytics.bucket')
// Key prefixes (inside BUCKET) for the two kinds of CSV output.
const BLOB_REFERENCE_COUNTS_PREFIX = 'blob-reference-counts/batches/'
const TEXT_OPERATION_COUNTS_PREFIX = 'text-operation-counts/'
|
|
|
|
/**
 * Entry point of the script.
 *
 * Parses the command line, refuses to run when a previous run is unfinished
 * and neither --restart nor --continue was given, wipes previous results when
 * starting afresh, and then processes doc ids batch by batch.
 *
 * @return {Promise<void>}
 */
async function main() {
  const programName = Path.basename(process.argv[1])
  const options = commandLineArgs([
    { name: 'restart', type: Boolean },
    { name: 'continue', type: Boolean },
    { name: 'batch-size', type: Number, defaultValue: DEFAULT_BATCH_SIZE },
    { name: 'timeout', type: Number, defaultValue: DEFAULT_TIMEOUT },
    { name: 'concurrency', type: Number, defaultValue: 1 },
    { name: 'min-doc-id', type: Number, defaultValue: 1 },
    { name: 'max-doc-id', type: Number, defaultValue: MAX_POSTGRES_INTEGER },
  ])

  const inProgress = await isRunInProgress()
  const wantsRestart = Boolean(options.restart)
  const wantsContinue = Boolean(options.continue)

  // An unfinished run exists and the user did not say what to do with it.
  if (inProgress && !(wantsRestart || wantsContinue)) {
    console.log(`\
A blob reference count is already under way.

To resume this run, use: ${programName} --continue
To start a new run, use: ${programName} --restart`)
    return
  }

  // Start from a clean slate on the first run or on an explicit restart.
  if (wantsRestart || !inProgress) {
    await initialize()
  }

  const firstDocId = options['min-doc-id']
  const lastDocId = options['max-doc-id']
  const nextDocId = await getNextDocId(firstDocId, lastDocId)
  await run(nextDocId, lastDocId, {
    batchSize: options['batch-size'],
    timeout: options.timeout,
    concurrency: options.concurrency,
  })
}
|
|
|
|
/**
 * Tell whether a run has already recorded at least one batch.
 *
 * @return {Promise<boolean>} true when blob_reference_count_batches has a row
 */
async function isRunInProgress() {
  const anyBatch = await knex('blob_reference_count_batches').first()
  const isMissing = anyBatch === null || anyBatch === undefined
  return !isMissing
}
|
|
|
|
/**
 * Find the doc id at which processing should start or resume.
 *
 * Looks up the highest end_doc_id recorded so far that does not exceed
 * maxDocId and continues right after it, but never before minDocId.
 *
 * @param {number} minDocId - lowest doc id this run may process
 * @param {number} maxDocId - highest doc id this run may process
 * @return {Promise<number>} the first doc id to process
 */
async function getNextDocId(minDocId, maxDocId) {
  const row = await knex('blob_reference_count_batches')
    .where('end_doc_id', '<=', maxDocId)
    .max({ lastDocId: 'end_doc_id' })
    .first()
  const { lastDocId } = row
  // No batch recorded in range yet: begin at the lower bound.
  return lastDocId == null ? minDocId : Math.max(minDocId, lastDocId + 1)
}
|
|
|
|
/**
 * Reset all state left over from previous runs: delete both CSV output
 * directories from the bucket, then empty the batch bookkeeping table.
 *
 * @return {Promise<void>}
 */
async function initialize() {
  const outputPrefixes = [
    BLOB_REFERENCE_COUNTS_PREFIX,
    TEXT_OPERATION_COUNTS_PREFIX,
  ]
  // Deletions run one after the other, in the order listed above.
  for (const prefix of outputPrefixes) {
    await persistor.deleteDirectory(BUCKET, prefix)
  }
  await knex('blob_reference_count_batches').truncate()
}
|
|
|
|
/**
 * Process doc ids in batches until none are left or the time budget is spent.
 *
 * For each batch: load the chunks of the batch's doc ids, feed every chunk's
 * history to a fresh pair of counters, upload both CSV files (named after the
 * first doc id of the batch) and finally record the batch as done.
 *
 * @param {number} startDocId - first doc id to process
 * @param {number} maxDocId - last doc id to process (inclusive)
 * @param {Object} options
 * @param {number} options.timeout - time budget in seconds
 * @param {number} options.batchSize - maximum number of doc ids per batch
 * @param {number} options.concurrency - number of chunks loaded concurrently
 * @return {Promise<void>}
 */
async function run(startDocId, maxDocId, options) {
  const { timeout, batchSize, concurrency } = options
  const deadline = Date.now() + timeout * 1000
  let batchStart = startDocId

  for (;;) {
    // The deadline is only checked between batches; a batch is never cut short.
    if (Date.now() > deadline) {
      console.log('Timeout exceeded. Exiting early.')
      return
    }

    const docIds = await getDocIds(batchStart, maxDocId, batchSize)
    if (docIds.length === 0) {
      console.log('No more projects to process. Bye!')
      return
    }

    // Doc ids come back sorted, so the last one closes the batch.
    const batchEnd = docIds[docIds.length - 1]
    console.log(`Processing doc ids ${batchStart} to ${batchEnd}...`)

    const chunks = await getChunks(docIds)
    const blobReferenceCounter = new BlobReferenceCounter()
    const textOperationCounter = new TextOperationCounter()
    const countChunk = async chunk => {
      const history = await getHistory(chunk)
      blobReferenceCounter.processHistory(history, chunk.projectId)
      textOperationCounter.processHistory(history, chunk.projectId)
    }
    await BPromise.map(chunks, countChunk, { concurrency })

    await storeBlobReferenceCounts(batchStart, blobReferenceCounter.getCounts())
    await storeTextOperationCounts(batchStart, textOperationCounter.getCounts())
    await recordBatch(batchStart, batchEnd)

    batchStart = batchEnd + 1
  }
}
|
|
|
|
/**
 * Fetch the next batch of active doc ids.
 *
 * @param {number} minDocId - lowest doc id to include
 * @param {number} maxDocId - highest doc id to include
 * @param {number} batchSize - maximum number of ids to return
 * @return {Promise<Array>} doc ids from active_doc_ids in ascending order
 */
async function getDocIds(minDocId, maxDocId, batchSize) {
  const query = knex('active_doc_ids')
    .select('doc_id')
    .where('doc_id', '>=', minDocId)
    .where('doc_id', '<=', maxDocId)
    .orderBy('doc_id')
    .limit(batchSize)
    .pluck('doc_id')
  const ids = await query
  return ids
}
|
|
|
|
/**
 * Look up every chunk belonging to the given doc ids.
 *
 * @param {Array} docIds - doc ids of the current batch
 * @return {Promise<Array<{id: *, projectId: *}>>} one record per chunk, with
 *   the chunk's doc_id exposed as `projectId`
 */
async function getChunks(docIds) {
  const columns = ['id', { projectId: 'doc_id' }]
  const rows = await knex('chunks')
    .select(...columns)
    .where('doc_id', 'in', docIds)
  return rows
}
|
|
|
|
/**
 * Remember that a batch has been fully processed, so that a later run can
 * resume after it.
 *
 * @param {number} batchStart - first doc id of the batch
 * @param {number} batchEnd - last doc id of the batch
 * @return {Promise<void>}
 */
async function recordBatch(batchStart, batchEnd) {
  const row = { start_doc_id: batchStart, end_doc_id: batchEnd }
  await knex('blob_reference_count_batches').insert(row)
}
|
|
|
|
/**
 * Load a chunk from the history store and turn it into a History object.
 *
 * @param {{id: *, projectId: *}} chunk - chunk record as returned by getChunks
 * @return {Promise<History>}
 */
async function getHistory(chunk) {
  const { projectId, id: chunkId } = chunk
  return History.fromRaw(await historyStore.loadRaw(projectId, chunkId))
}
|
|
|
|
/**
 * Upload the blob reference counts of one batch as a CSV file.
 *
 * The file is written to the analytics bucket under
 * BLOB_REFERENCE_COUNTS_PREFIX and is named after the first doc id of the
 * batch.
 *
 * @param {number} startDocId - first doc id of the batch; used in the object key
 * @param {Map} counts - reference counts keyed by blob hash
 * @return {Promise<void>} settles together with the upload
 */
async function storeBlobReferenceCounts(startDocId, counts) {
  const key = `${BLOB_REFERENCE_COUNTS_PREFIX}${startDocId}.csv`
  const csv = makeCsvFromMap(counts)
  const stream = stringToStream(csv)
  // Wait for the upload: the caller records the batch as done right after this
  // function returns, and a floating promise would hide upload failures.
  await persistor.sendStream(BUCKET, key, stream)
}
|
|
|
|
/**
 * Upload the text operation counts of one batch as a CSV file.
 *
 * The file is written to the analytics bucket under
 * TEXT_OPERATION_COUNTS_PREFIX and is named after the first doc id of the
 * batch.
 *
 * @param {number} startDocId - first doc id of the batch; used in the object key
 * @param {Map} counts - text operation counts keyed by project id
 * @return {Promise<void>} settles together with the upload
 */
async function storeTextOperationCounts(startDocId, counts) {
  await persistor.sendStream(
    BUCKET,
    `${TEXT_OPERATION_COUNTS_PREFIX}${startDocId}.csv`,
    stringToStream(makeCsvFromMap(counts))
  )
}
|
|
|
|
/**
 * Serialize a Map as CSV text: one "key,value" line per entry, ordered by
 * ascending key, lines separated by "\n" with no trailing newline.
 *
 * Keys are compared with `<` and `>`, so numeric keys sort numerically and
 * string keys sort by code unit. No quoting or escaping is applied.
 *
 * @param {Map} map
 * @return {string} the CSV text; empty string for an empty map
 */
function makeCsvFromMap(map) {
  const byAscendingKey = ([leftKey], [rightKey]) => {
    if (leftKey < rightKey) return -1
    return leftKey > rightKey ? 1 : 0
  }
  return [...map.entries()]
    .sort(byAscendingKey)
    .map(pair => pair.join(','))
    .join('\n')
}
|
|
|
|
/**
 * Add one to the counter stored under `key`, treating a missing (or otherwise
 * falsy) entry as zero.
 *
 * @param {Map} map - counters, modified in place
 * @param {*} key - entry to increment
 */
function incrementMapEntry(map, key) {
  const previous = map.get(key)
  const base = previous ? previous : 0
  map.set(key, base + 1)
}
|
|
|
|
/**
 * Collects the blob hashes found in each project's histories and reports, for
 * every hash, how many of the processed projects contain it.
 */
class BlobReferenceCounter {
  constructor() {
    // projectId -> Set of blob hashes seen in that project's histories
    this.blobHashesByProjectId = new Map()
  }

  /**
   * Add the blob hashes of one history to the set kept for its project.
   *
   * @param {History} history - must provide findBlobHashes(set)
   * @param {*} projectId - project the history belongs to
   */
  processHistory(history, projectId) {
    const known = this.blobHashesByProjectId.get(projectId)
    const blobHashes = known == null ? new Set() : known
    if (blobHashes !== known) {
      // First history seen for this project: register its new set.
      this.blobHashesByProjectId.set(projectId, blobHashes)
    }
    history.findBlobHashes(blobHashes)
  }

  /**
   * @return {Map} blob hash -> number of processed projects containing it
   */
  getCounts() {
    const countsByHash = new Map()
    this.blobHashesByProjectId.forEach(projectHashes => {
      // Hashes are deduplicated per project, so each project counts once.
      projectHashes.forEach(hash => {
        incrementMapEntry(countsByHash, hash)
      })
    })
    return countsByHash
  }
}
|
|
|
|
/**
 * Tracks, per project, the largest number of text operations found in a single
 * change. Changes below TEXT_OPERATION_COUNT_THRESHOLD are ignored.
 */
class TextOperationCounter {
  constructor() {
    // projectId -> highest qualifying text operation count seen in one change
    this.countsByProjectId = new Map()
  }

  /**
   * Inspect every change of a history and update the project's maximum.
   *
   * @param {History} history - must provide getChanges()
   * @param {*} projectId - project the history belongs to
   */
  processHistory(history, projectId) {
    for (const change of history.getChanges()) {
      let editCount = 0
      for (const operation of change.getOperations()) {
        editCount += operation instanceof EditFileOperation ? 1 : 0
      }
      if (editCount < TEXT_OPERATION_COUNT_THRESHOLD) {
        continue
      }
      const previousMax = this.countsByProjectId.get(projectId) || 0
      this.countsByProjectId.set(projectId, Math.max(previousMax, editCount))
    }
  }

  /**
   * @return {Map} projectId -> largest qualifying count (the live internal map)
   */
  getCounts() {
    return this.countsByProjectId
  }
}
|
|
|
|
// Run the script, then end the process explicitly: exit code 0 on success,
// or print the error and exit with code 1 on failure.
const exitCleanly = () => {
  process.exit()
}
const exitWithError = err => {
  console.error(err)
  process.exit(1)
}
main().then(exitCleanly).catch(exitWithError)
|