Merge pull request #11246 from overleaf/jpa-user-content-domain-access-check

[misc] prepare migration to user content domain

GitOrigin-RevId: 581ccab6d39ec021fb44a555a09e55441c35d0d1
This commit is contained in:
Jakob Ackermann 2023-01-17 15:23:51 +00:00 committed by Copybot
parent 4325f1d947
commit 59e587320a
15 changed files with 484 additions and 29 deletions

View file

@ -21,6 +21,32 @@ server {
text/plain log blg aux stdout stderr;
application/pdf pdf;
}
# user content domain access check
# The project-id is zero prefixed. No actual user project uses these ids.
# mongo-id 000000000000000000000000 -> 1970-01-01T00:00:00.000Z
# mongo-id 000000010000000000000000 -> 1970-01-01T00:00:01.000Z
# mongo-id 100000000000000000000000 -> 1978-07-04T21:24:16.000Z
# This allows us to distinguish between check-traffic and regular output traffic.
# Access-check probe for the per-user output path (zero-prefixed project id).
# Every GET/HEAD is answered with the static tiny.pdf instead of a real
# compile output.
location ~ ^/project/0([0-9a-f]+)/user/([0-9a-f]+)/build/([0-9a-f-]+)/output/output\.pdf$ {
  if ($request_method = 'OPTIONS') {
    # handle OPTIONS method for CORS requests
    # A media type parameter has to be separated from the type with ';'.
    add_header 'Content-Type' 'text/plain; charset=UTF-8';
    add_header 'Allow' 'GET,HEAD';
    return 200 'GET,HEAD';
  }
  alias /var/clsi/tiny.pdf;
}
# Access-check probe for the output path without a user id (zero-prefixed
# project id). Every GET/HEAD is answered with the static tiny.pdf instead of
# a real compile output.
location ~ ^/project/0([0-9a-f]+)/build/([0-9a-f-]+)/output/output\.pdf$ {
  if ($request_method = 'OPTIONS') {
    # handle OPTIONS method for CORS requests
    # A media type parameter has to be separated from the type with ';'.
    add_header 'Content-Type' 'text/plain; charset=UTF-8';
    add_header 'Allow' 'GET,HEAD';
    return 200 'GET,HEAD';
  }
  alias /var/clsi/tiny.pdf;
}
# handle output files for specific users
location ~ ^/project/([0-9a-f]+)/user/([0-9a-f]+)/build/([0-9a-f-]+)/output/output\.([a-z]+)$ {
if ($request_method = 'OPTIONS') {

58
services/clsi/tiny.pdf Normal file
View file

@ -0,0 +1,58 @@
%PDF-1.1
%¥±ë
1 0 obj
<< /Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<< /Type /Pages
/Kids [3 0 R]
/Count 1
/MediaBox [0 0 300 144]
>>
endobj
3 0 obj
<< /Type /Page
/Parent 2 0 R
/Resources
<< /Font
<< /F1
<< /Type /Font
/Subtype /Type1
/BaseFont /Times-Roman
>>
>>
>>
/Contents 4 0 R
>>
endobj
4 0 obj
<< /Length 55 >>
stream
BT
/F1 18 Tf
0 0 Td
(Hello World) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000018 00000 n
0000000077 00000 n
0000000178 00000 n
0000000457 00000 n
trailer
<< /Root 1 0 R
/Size 5
>>
startxref
565
%%EOF

View file

@ -38,12 +38,7 @@ async function getPdfCachingMinChunkSize(req, res) {
return parseInt(variant, 10)
}
const getPdfCachingOptions = callbackify(async function (req, res) {
if (!req.query.enable_pdf_caching) {
// The frontend does not want to do pdf caching.
return { enablePdfCaching: false }
}
const getSplitTestOptions = callbackify(async function (req, res) {
// Use the query flags from the editor request for overriding the split test.
let query = {}
try {
@ -52,6 +47,22 @@ const getPdfCachingOptions = callbackify(async function (req, res) {
} catch (e) {}
const editorReq = { ...req, query }
const { variant: domainVariant } =
await SplitTestHandler.promises.getAssignment(
editorReq,
res,
'pdf-download-domain'
)
const pdfDownloadDomain =
domainVariant === 'user' && Settings.compilesUserContentDomain
? Settings.compilesUserContentDomain
: Settings.pdfDownloadDomain
if (!req.query.enable_pdf_caching) {
// The frontend does not want to do pdf caching.
return { pdfDownloadDomain, enablePdfCaching: false }
}
// Double check with the latest split test assignment.
// We may need to turn off the feature on a short notice, without requiring
// all users to reload their editor page to disable the feature.
@ -63,13 +74,10 @@ const getPdfCachingOptions = callbackify(async function (req, res) {
const enablePdfCaching = variant === 'enabled'
if (!enablePdfCaching) {
// Skip the lookup of the chunk size when caching is not enabled.
return { enablePdfCaching: false }
return { pdfDownloadDomain, enablePdfCaching: false }
}
const pdfCachingMinChunkSize = await getPdfCachingMinChunkSize(editorReq, res)
return {
enablePdfCaching,
pdfCachingMinChunkSize,
}
return { pdfDownloadDomain, enablePdfCaching, pdfCachingMinChunkSize }
})
module.exports = CompileController = {
@ -108,9 +116,10 @@ module.exports = CompileController = {
options.incrementalCompilesEnabled = true
}
getPdfCachingOptions(req, res, (err, pdfCachingOptions) => {
getSplitTestOptions(req, res, (err, splitTestOptions) => {
if (err) return next(err)
const { enablePdfCaching, pdfCachingMinChunkSize } = pdfCachingOptions
let { enablePdfCaching, pdfCachingMinChunkSize, pdfDownloadDomain } =
splitTestOptions
options.enablePdfCaching = enablePdfCaching
if (enablePdfCaching) {
options.pdfCachingMinChunkSize = pdfCachingMinChunkSize
@ -136,7 +145,6 @@ module.exports = CompileController = {
return next(error)
}
Metrics.inc('compile-status', 1, { status })
let pdfDownloadDomain = Settings.pdfDownloadDomain
if (pdfDownloadDomain && outputUrlPrefix) {
pdfDownloadDomain += outputUrlPrefix
}

View file

@ -1086,6 +1086,28 @@ const ProjectController = {
}
)
},
userContentDomainAccessCheckAssigment(cb) {
SplitTestHandler.getAssignment(
req,
res,
'user-content-domain-access-check',
() => {
// We'll pick up the assignment from the res.locals assignment.
cb()
}
)
},
reportUserContentDomainAccessCheckErrorAssigment(cb) {
SplitTestHandler.getAssignment(
req,
res,
'report-user-content-domain-access-check-error',
() => {
// We'll pick up the assignment from the res.locals assignment.
cb()
}
)
},
recompileButtonTextAssignment: [
'user',
(results, cb) => {

View file

@ -0,0 +1,21 @@
const Metrics = require('@overleaf/metrics')
/**
 * Record the outcome of a client-side user content domain access check.
 *
 * Adds the `succeeded` and `failed` probe counts from the request body to the
 * `user_content_domain_check` counter under the `success` and `failure`
 * status labels, then replies with an empty 204.
 *
 * A field that is absent from the body is counted as 0 instead of being
 * handed to the metrics client as `undefined`.
 *
 * @param {Object} req - request carrying the parsed check result in `req.body`
 * @param {Object} res - response used for the empty 204 reply
 */
function recordCheckResult(req, res) {
  const { succeeded = 0, failed = 0 } = req.body
  Metrics.count('user_content_domain_check', succeeded, 1, {
    status: 'success',
  })
  Metrics.count('user_content_domain_check', failed, 1, {
    status: 'failure',
  })
  res.sendStatus(204)
}
/**
 * Count one report of a client falling back from the user content domain.
 *
 * Increments the `user_content_domain_fallback` metric and replies with an
 * empty 204. The request itself is not inspected.
 *
 * @param {Object} _req - unused
 * @param {Object} res - response used for the empty 204 reply
 */
function recordFallbackUsage(_req, res) {
  const NO_CONTENT = 204
  Metrics.inc('user_content_domain_fallback')
  res.sendStatus(NO_CONTENT)
}
// Express handlers for the user content domain check endpoints.
module.exports = {
recordCheckResult,
recordFallbackUsage,
}

View file

@ -66,6 +66,7 @@ const _ = require('underscore')
const { expressify } = require('./util/promises')
const { plainTextResponse } = require('./infrastructure/Response')
const PublicAccessLevels = require('./Features/Authorization/PublicAccessLevels')
const UserContentDomainController = require('./Features/UserContentDomainCheck/UserContentDomainController')
module.exports = { initialize }
@ -1304,6 +1305,31 @@ function initialize(webRouter, privateApiRouter, publicApiRouter) {
res.sendStatus(204)
})
// Receives the summary of a client-side user content domain access check.
webRouter.post(
  '/record-user-content-domain-access-check-result',
  validate({
    body: Joi.object({
      // Probe counts are whole numbers; the client-side check runs 6 probes
      // (2 urls x 3 request kinds), hence the upper bound.
      failed: Joi.number().integer().min(0).max(6),
      succeeded: Joi.number().integer().min(0).max(6),
    }),
  }),
  RateLimiterMiddleware.rateLimit({
    endpointName: 'user-content-domain-a-c-r',
    maxRequests: 15,
    timeInterval: 60,
  }),
  UserContentDomainController.recordCheckResult
)
// Receives the notification that a client switched to the fallback compile
// domain. The request carries no body; it is rate limited like the
// access-check result endpoint.
webRouter.post(
'/record-user-content-domain-fallback-usage',
RateLimiterMiddleware.rateLimit({
endpointName: 'user-content-domain-fb-u',
maxRequests: 15,
timeInterval: 60,
}),
UserContentDomainController.recordFallbackUsage
)
webRouter.get(
`/read/:token(${TokenAccessController.READ_ONLY_TOKEN_PATTERN})`,
RateLimiterMiddleware.rateLimit({

View file

@ -12,6 +12,8 @@ meta(name="ol-isRestrictedTokenMember" data-type="boolean" content=isRestrictedT
meta(name="ol-maxDocLength" data-type="json" content=maxDocLength)
meta(name="ol-wikiEnabled" data-type="boolean" content=!!(settings.apis.wiki && settings.apis.wiki.url))
meta(name="ol-gitBridgePublicBaseUrl" content=gitBridgePublicBaseUrl)
meta(name="ol-compilesUserContentDomain" content=settings.compilesUserContentDomain)
meta(name="ol-fallbackCompileDomain" content=settings.pdfDownloadDomain)
//- Set base path for Ace scripts loaded on demand/workers and don't use cdn
meta(name="ol-aceBasePath" content="/js/" + lib('ace'))
//- enable doc hash checking for all projects

View file

@ -542,6 +542,7 @@ module.exports = {
// Domain the client (pdfjs) should download the compiled pdf from
pdfDownloadDomain: process.env.PDF_DOWNLOAD_DOMAIN, // "http://clsi-lb:3014"
compilesUserContentDomain: process.env.COMPILES_USER_CONTENT_DOMAIN,
// By default turn on feature flag, can be overridden per request.
enablePdfCaching: process.env.ENABLE_PDF_CACHING === 'true',

View file

@ -0,0 +1,51 @@
import { isNetworkError } from '../../../utils/isNetworkError'
import getMeta from '../../../utils/meta'
import OError from '@overleaf/o-error'
import { postJSON } from '../../../infrastructure/fetch-json'
// Point in time (on the performance.now() clock) until which requests are
// sent straight to the fallback domain. It starts at "now", so the fallback
// is not active when the module loads.
let useFallbackDomainUntil = performance.now()
// How long the fallback domain stays in use after a successful fallback.
const ONE_HOUR_IN_MS = 1000 * 60 * 60
/**
 * Fetch a compile output url, retrying on the fallback compile domain when
 * the user content domain cannot be reached.
 *
 * - While a previous fallback is still active, the request is rewritten to
 *   the fallback domain up front and no retry happens.
 * - Otherwise the url is fetched as given. When that throws an error that
 *   `isNetworkError` accepts and the url points at the user content domain
 *   (hostname comparison), the same request is repeated against the fallback
 *   domain. A successful retry activates the fallback for one hour and
 *   reports the usage once.
 *
 * @param url absolute url of the output file
 * @param init fetch options, passed on unchanged to every attempt
 * @returns the response of the first attempt that did not throw
 * @throws the original error when no retry applies; the retry error, tagged
 *   with the original one, when the retry fails as well
 */
export async function fetchFromCompileDomain(url: string, init: RequestInit) {
  const userContentDomain = getMeta('ol-compilesUserContentDomain')
  let targetsUserContentDomain =
    userContentDomain &&
    new URL(url).hostname === new URL(userContentDomain).hostname

  const fallbackIsActive = useFallbackDomainUntil > performance.now()
  if (fallbackIsActive) {
    // Already on the fallback: nothing left to retry against.
    targetsUserContentDomain = false
    url = withFallbackCompileDomain(url)
  }

  try {
    return await fetch(url, init)
  } catch (primaryError) {
    const canRetry = isNetworkError(primaryError) && targetsUserContentDomain
    if (!canRetry) {
      throw primaryError
    }
    try {
      const fallbackResponse = await fetch(
        withFallbackCompileDomain(url),
        init
      )
      // Only switch to the fallback when fetch does not throw there as well.
      if (useFallbackDomainUntil < performance.now()) {
        useFallbackDomainUntil = performance.now() + ONE_HOUR_IN_MS
        recordFallbackUsage()
      }
      return fallbackResponse
    } catch (fallbackError: any) {
      throw OError.tag(fallbackError, 'fallback request failed', {
        errUserContentDomain: primaryError,
      })
    }
  }
}
/**
 * Point a url at the fallback compile domain.
 *
 * Only the hostname is swapped for the one from the
 * `ol-fallbackCompileDomain` meta entry; protocol, port, path and query of
 * the input url are kept.
 *
 * @param url absolute url to rewrite
 * @returns the rewritten url as a string
 */
function withFallbackCompileDomain(url: string) {
  const rewritten = new URL(url)
  const { hostname } = new URL(getMeta('ol-fallbackCompileDomain'))
  rewritten.hostname = hostname
  return rewritten.toString()
}
/**
 * Tell the server, one second from now, that this client switched to the
 * fallback domain. Failures of that request are ignored.
 */
function recordFallbackUsage() {
  const report = () => {
    postJSON('/record-user-content-domain-fallback-usage').catch(() => {})
  }
  setTimeout(report, 1_000)
}

View file

@ -3,6 +3,7 @@ import HumanReadableLogs from '../../../ide/human-readable-logs/HumanReadableLog
import BibLogParser from '../../../ide/log-parser/bib-log-parser'
import { v4 as uuid } from 'uuid'
import { enablePdfCaching } from './pdf-caching-flags'
import { fetchFromCompileDomain } from './fetchFromCompileDomain'
// Warnings that may disappear after a second LaTeX pass
const TRANSIENT_WARNING_REGEX = /^(Reference|Citation).+undefined on input line/
@ -69,9 +70,10 @@ export const handleLogFiles = async (outputFiles, data, signal) => {
if (logFile) {
try {
const response = await fetch(buildURL(logFile, data.pdfDownloadDomain), {
signal,
})
const response = await fetchFromCompileDomain(
buildURL(logFile, data.pdfDownloadDomain),
{ signal }
)
result.log = await response.text()
@ -99,9 +101,10 @@ export const handleLogFiles = async (outputFiles, data, signal) => {
if (blgFile) {
try {
const response = await fetch(buildURL(blgFile, data.pdfDownloadDomain), {
signal,
})
const response = await fetchFromCompileDomain(
buildURL(blgFile, data.pdfDownloadDomain),
{ signal }
)
const log = await response.text()
@ -163,7 +166,7 @@ function buildURL(file, pdfDownloadDomain) {
return `${pdfDownloadDomain}${file.url}`
}
// Go through web instead, which uses mongo for checking project access.
return file.url
return `${window.origin}${file.url}`
}
function normalizeFilePath(path, rootDocDirname) {

View file

@ -1,10 +1,11 @@
import OError from '@overleaf/o-error'
import { fetchFromCompileDomain } from './fetchFromCompileDomain'
const PDF_JS_CHUNK_SIZE = 128 * 1024
const MAX_SUB_REQUEST_COUNT = 4
const MAX_SUB_REQUEST_BYTES = 4 * PDF_JS_CHUNK_SIZE
const SAMPLE_NGINX_BOUNDARY = '00000000000000000001'
const HEADER_OVERHEAD_PER_MULTI_PART_CHUNK = composeMultipartHeader({
export const HEADER_OVERHEAD_PER_MULTI_PART_CHUNK = composeMultipartHeader({
boundary: SAMPLE_NGINX_BOUNDARY,
// Assume an upper bound of O(9GB) for the pdf size.
start: 9 * 1024 * 1024 * 1024,
@ -73,7 +74,7 @@ function preprocessFileOnce({ file, usageScore, cachedUrls }) {
/**
* @param {Array} chunks
*/
function estimateSizeOfMultipartResponse(chunks) {
export function estimateSizeOfMultipartResponse(chunks) {
/*
--boundary
HEADER
@ -357,7 +358,7 @@ function getResponseSize(response) {
* @param {Response} response
* @param chunk
*/
function getMultipartBoundary(response, chunk) {
export function getMultipartBoundary(response, chunk) {
if (!Array.isArray(chunk)) return ''
const raw = response.headers.get('Content-Type')
@ -392,7 +393,13 @@ function composeMultipartHeader({ boundary, start, end, size }) {
* @param {string} boundary
* @param {Object} metrics
*/
function resolveMultiPartResponses({ file, chunks, data, boundary, metrics }) {
export function resolveMultiPartResponses({
file,
chunks,
data,
boundary,
metrics,
}) {
const responses = []
let offsetStart = 0
const encoder = new TextEncoder()
@ -439,7 +446,7 @@ function resolveMultiPartResponses({ file, chunks, data, boundary, metrics }) {
* @param {number} estimatedSize
* @param {RequestInit} init
*/
function checkChunkResponse(response, estimatedSize, init) {
export function checkChunkResponse(response, estimatedSize, init) {
if (!(response.status === 206 || response.status === 200)) {
throw new OError('non successful response status: ' + response.status, {
responseHeaders: Object.fromEntries(response.headers.entries()),
@ -477,7 +484,7 @@ export async function fallbackRequest({ url, start, end, abortSignal }) {
headers: { Range: `bytes=${start}-${end - 1}` },
signal: abortSignal,
}
const response = await fetch(url, init)
const response = await fetchFromCompileDomain(url, init)
checkChunkResponse(response, end - start, init)
return await response.arrayBuffer()
} catch (e) {
@ -556,7 +563,7 @@ async function fetchChunk({
// result all the browser cache keys (aka urls) get invalidated.
// We memorize the previous browser cache keys in `cachedUrls`.
try {
const response = await fetch(oldUrl, init)
const response = await fetchFromCompileDomain(oldUrl, init)
if (response.status === 200) {
checkChunkResponse(response, estimatedSize, init)
metrics.oldUrlHitCount += 1
@ -571,7 +578,7 @@ async function fetchChunk({
// Fallback to the latest url.
}
}
const response = await fetch(url, init)
const response = await fetchFromCompileDomain(url, init)
checkChunkResponse(response, estimatedSize, init)
if (chunk.hash) cachedUrls.set(chunk.hash, url)
return response

View file

@ -0,0 +1,215 @@
import {
checkChunkResponse,
estimateSizeOfMultipartResponse,
getMultipartBoundary,
resolveMultiPartResponses,
} from '../pdf-preview/util/pdf-caching'
import getMeta from '../../utils/meta'
import OError from '@overleaf/o-error'
import { captureException } from '../../infrastructure/error-reporter'
import { postJSON } from '../../infrastructure/fetch-json'
import isSplitTestEnabled from '../../utils/isSplitTestEnabled'
// Wait before the scheduled check starts.
const INITIAL_DELAY_MS = 30_000
// Pause in front of every single probe request.
const DELAY_BETWEEN_PROBES_MS = 1_000
// A probe request is aborted after this long.
const TIMEOUT_MS = 30_000
// Expected size in bytes and SHA-256 (hex) of the full download.
// NOTE(review): presumably these describe the tiny.pdf served for
// check-traffic -- confirm whenever that file changes.
const FULL_SIZE = 739
const FULL_HASH =
'b7d25591c18da373709d3d88ddf5eeab0b5089359e580f051314fd8935df0b73'
// Byte ranges used for the range/multipart probes. `end` is exclusive (the
// Range header is built with `end - 1`); `hash` is the expected SHA-256 (hex)
// of the bytes in that range.
const CHUNKS = [
{
start: 0,
end: 21,
hash: 'd2ad9cbf1bc669646c0dfc43fa3167d30ab75077bb46bc9e3624b9e7e168abc2',
},
{
start: 21,
end: 42,
hash: 'd6d110ec0f3f4e27a4050bc2be9c5552cc9092f86b74fec75072c2c9e8483454',
},
{
start: 42,
end: 64,
hash: '8278914487a3a099c9af5aa22ed836d6587ca0beb7bf9a059fb0409667b3eb3d',
},
]
/**
 * Resolve (with no value) once `ms` milliseconds have passed.
 *
 * @param ms delay in milliseconds
 */
async function sleep(ms: number) {
  return new Promise(resolve => {
    setTimeout(resolve, ms)
  })
}
/**
 * Pick one of the zones 'b', 'c' or 'd' at random.
 *
 * A draw above 0.66 yields 'b', a draw above 0.33 yields 'c', anything else
 * yields 'd'.
 */
function pickZone() {
  const roll = Math.random()
  if (roll > 0.66) {
    return 'b'
  }
  if (roll > 0.33) {
    return 'c'
  }
  return 'd'
}
/**
 * Render bytes as a lowercase hex string, two digits per byte.
 *
 * @param a bytes to encode
 * @returns hex string of length `2 * a.length`
 */
function arrayLikeToHex(a: Uint8Array) {
  let hex = ''
  for (const byte of Array.from(a)) {
    hex += byte.toString(16).padStart(2, '0')
  }
  return hex
}
/**
 * Compute the SHA-256 digest of `body` as a hex string.
 *
 * @param body bytes to hash
 */
async function hashBody(body: ArrayBuffer) {
  const digestBytes = new Uint8Array(
    await crypto.subtle.digest('SHA-256', body)
  )
  return arrayLikeToHex(digestBytes)
}
/**
 * Verify that `data` hashes to `expectedHash`.
 *
 * @param res response the data came from; its headers are attached to the
 *   error for debugging
 * @param data bytes to verify
 * @param expectedHash expected SHA-256 digest as hex
 * @throws OError 'content hash mismatch' when the digests differ
 */
async function checkHash(
  res: Response,
  data: ArrayBuffer,
  expectedHash: string
) {
  const actualHash = await hashBody(data)
  if (actualHash === expectedHash) {
    return
  }
  const headers = Object.fromEntries(res.headers.entries())
  throw new OError('content hash mismatch', {
    actualHash,
    expectedHash,
    headers,
  })
}
/**
 * Generate `bytes` random bytes via `crypto.getRandomValues` and return them
 * hex encoded.
 *
 * @param bytes number of random bytes
 * @returns hex string of length `2 * bytes`
 */
function randomHex(bytes: number) {
  // getRandomValues fills the array in place and hands the same array back.
  return arrayLikeToHex(crypto.getRandomValues(new Uint8Array(bytes)))
}
/**
 * Generate a build id of the form `<current time in hex>-<16 random hex digits>`.
 */
function genBuildId() {
  const timestampHex = Date.now().toString(16)
  return [timestampHex, randomHex(8)].join('-')
}
/**
 * Run one probe request and verify the response.
 *
 * The request bypasses the HTTP cache (`cache: 'no-store'`) and is aborted
 * after TIMEOUT_MS. The response is validated with `checkChunkResponse`; its
 * content is then hash-checked, either as a whole against `expectedHash` or,
 * when `chunks` is given, part by part against the hash of each chunk of the
 * multipart response.
 *
 * Note: `init` is modified in place (`signal` and `cache` are set on it).
 *
 * @param url url to probe
 * @param init fetch options, e.g. a Range header
 * @param estimatedSize expected response size, passed to `checkChunkResponse`
 * @param expectedHash expected SHA-256 (hex) of the body; only used when
 *   `chunks` is not given
 * @param chunks requested ranges of a multipart request
 * @throws when the request fails or times out, the response does not pass
 *   `checkChunkResponse`, or a hash does not match
 */
async function singleCheck(
  url: string,
  init: RequestInit,
  estimatedSize: number,
  expectedHash: string,
  chunks?: Array<any>
) {
  const ac = new AbortController()
  const timeout = setTimeout(() => ac.abort(), TIMEOUT_MS)
  init.signal = ac.signal
  init.cache = 'no-store'
  try {
    const res = await fetch(url, init)
    checkChunkResponse(res, estimatedSize, init)

    const body = await res.arrayBuffer()
    if (chunks) {
      const boundary = getMultipartBoundary(res, chunks)
      const parts = resolveMultiPartResponses({
        file: { size: FULL_SIZE },
        chunks,
        data: new Uint8Array(body),
        boundary,
        metrics: {},
      })
      for (const part of parts) {
        await checkHash(res, part.data, part.chunk.hash)
      }
    } else {
      await checkHash(res, body, expectedHash)
    }
  } finally {
    // Do not leave the abort timer pending once the probe has settled.
    clearTimeout(timeout)
  }
}
/**
 * Probe the user content domain from this browser and report the result.
 *
 * Two urls are probed (output path with and without a user id), each with a
 * full download, a single range request and a multipart range request. The
 * probes run one after another with a pause in front of each. A failing probe
 * is either sent to the error reporter or logged to the console, depending on
 * the 'report-user-content-domain-access-check-error' split test. At the end
 * the failed/succeeded counts are posted to the server; a failure of that
 * request is ignored.
 */
export async function checkUserContentDomainAccess() {
  // Note: The ids are zero prefixed. No actual user/project uses these ids.
  // mongo-id 000000000000000000000000 -> 1970-01-01T00:00:00.000Z
  // mongo-id 000000010000000000000000 -> 1970-01-01T00:00:01.000Z
  // mongo-id 100000000000000000000000 -> 1978-07-04T21:24:16.000Z
  // This allows us to distinguish between check-traffic and regular output
  // traffic.
  const projectId = `0${randomHex(12).slice(1)}`
  const userId = `0${randomHex(12).slice(1)}`
  const buildId = genBuildId()
  const zone = pickZone()

  const projectBase = `${getMeta(
    'ol-compilesUserContentDomain'
  )}/zone/${zone}/project/${projectId}`
  const outputPath = `build/${buildId}/output/output.pdf`
  const urls = [
    `${projectBase}/user/${userId}/${outputPath}`,
    `${projectBase}/${outputPath}`,
  ]

  const firstChunk = CHUNKS[0]
  const singleRange = `bytes=${firstChunk.start}-${firstChunk.end - 1}`
  const multiRange = `bytes=${CHUNKS.map(c => `${c.start}-${c.end - 1}`).join(
    ','
  )}`

  // Every case gets its own `init` object, as the probe modifies it.
  const cases = urls.flatMap(url => [
    // full download
    { url, init: {}, estimatedSize: FULL_SIZE, hash: FULL_HASH },
    // range request
    {
      url,
      init: { headers: { Range: singleRange } },
      estimatedSize: firstChunk.end - firstChunk.start,
      hash: firstChunk.hash,
    },
    // multipart request
    {
      url,
      init: { headers: { Range: multiRange } },
      estimatedSize: estimateSizeOfMultipartResponse(CHUNKS),
      hash: firstChunk.hash,
      chunks: CHUNKS,
    },
  ])

  let failed = 0
  for (const probe of cases) {
    await sleep(DELAY_BETWEEN_PROBES_MS)
    try {
      await singleCheck(
        probe.url,
        probe.init,
        probe.estimatedSize,
        probe.hash,
        probe.chunks
      )
    } catch (err: any) {
      failed++
      OError.tag(err, 'user-content-domain-access-check failed', {
        url: probe.url,
        init: probe.init,
      })
      const reportError = isSplitTestEnabled(
        'report-user-content-domain-access-check-error'
      )
      if (reportError) {
        captureException(err)
      } else {
        console.error(OError.getFullStack(err), OError.getFullInfo(err))
      }
    }
  }

  const succeeded = cases.length - failed
  try {
    await postJSON('/record-user-content-domain-access-check-result', {
      body: { failed, succeeded },
    })
  } catch (e) {}
}
/**
 * Start the access check in the background after INITIAL_DELAY_MS.
 *
 * Returns right away; an error escaping the check is handed to
 * `captureException`.
 */
export function scheduleUserContentDomainAccessCheck() {
  const runAfterDelay = async () => {
    await sleep(INITIAL_DELAY_MS)
    try {
      await checkUserContentDomainAccess()
    } catch (err) {
      captureException(err)
    }
  }
  void runAfterDelay()
}

View file

@ -69,6 +69,8 @@ import './features/source-editor/controllers/grammarly-warning-controller'
import { cleanupServiceWorker } from './utils/service-worker-cleanup'
import { reportCM6Perf } from './infrastructure/cm6-performance'
import { reportAcePerf } from './ide/editor/ace-performance'
import { scheduleUserContentDomainAccessCheck } from './features/user-content-domain-access-check'
import isSplitTestEnabled from './utils/isSplitTestEnabled'
App.controller(
'IdeController',
@ -479,6 +481,9 @@ If the project has been renamed please look in your project list for a new proje
)
cleanupServiceWorker()
if (isSplitTestEnabled('user-content-domain-access-check')) {
scheduleUserContentDomainAccessCheck()
}
angular.module('SharelatexApp').config(function ($provide) {
$provide.decorator('$browser', [

View file

@ -0,0 +1,5 @@
import getMeta from './meta'
/**
 * Tell whether the split test `name` has the variant 'enabled' in the
 * `ol-splitTestVariants` meta entry.
 *
 * @param name split test name
 * @returns false when the meta entry or the test is missing, or when the
 *   variant is anything other than 'enabled'
 */
export default function isSplitTestEnabled(name: string) {
  const variants = getMeta('ol-splitTestVariants')
  const variant = variants?.[name]
  return variant === 'enabled'
}

View file

@ -62,6 +62,11 @@ describe('CompileController', function () {
getAssignment: (this.getAssignment = sinon.stub().yields(null, {
variant: 'default',
})),
promises: {
getAssignment: sinon.stub().resolves({
variant: 'default',
}),
},
},
'../Analytics/AnalyticsManager': {
recordEventForSession: sinon.stub(),