overleaf/services/web/scripts/extract_onboardingdatacollection_never_used_latex.js
roo hutton 172aeb59d1 Merge pull request #22260 from overleaf/rh-odc-students-never-used
Add student filter and country field to 'never used LaTeX' ODC script

GitOrigin-RevId: 9db298a48b7d70b59b81388ff93fc6c9575861d7
2024-12-03 09:04:59 +00:00

202 lines
5.6 KiB
JavaScript

const csv = require('csv')
const fs = require('fs')
const minimist = require('minimist')
const {
OnboardingDataCollection,
} = require('../app/src/models/OnboardingDataCollection')
const { User } = require('../app/src/models/User')
const SubscriptionLocator = require('../app/src/Features/Subscription/SubscriptionLocator')
const Settings = require('@overleaf/settings')
const { fetchJson } = require('@overleaf/fetch-utils')
/**
* This script extracts ODC data with some extra fields, and filters on registration date and LaTeX experience
*
* It will:
* — filter for used_latex=never
* — augment rows with user registered date and email addresses
* — filter on users registered on or before a certain date (--registeredBefore)
* — export updated CSV
*
* Usage:
* - Locally:
* - docker compose exec web bash
* - node scripts/extract_onboardingdatacollection_never_used_latex.js
* - On the server:
* - rake connect:app[staging,web]
* - node scripts/extract_onboardingdatacollection_never_used_latex.js
* - exit
* - kubectl cp web-standalone-prod-XXXXX:/tmp/odc_neverUsedLatex.csv ~/odc_neverUsedLatex.csv
*/
/**
 * Print the command-line help text for this script to stdout.
 */
function usage() {
  // The help text starts and ends with an empty line, hence the leading and
  // trailing '' entries.
  const helpLines = [
    '',
    'Onboarding Data Collection extraction, outputs to /tmp/odc_neverUsedLatex.csv',
    'Usage:',
    'node scripts/extract_onboardingdatacollection_never_used_latex.js [--registeredBefore=<date>] [--studentsOnly] [--includeSignUpDate] [--includeCountry] [--includePlanCode]',
    'Options:',
    '--help Show this screen',
    '--registeredBefore=<date> Limit to users registered before ISO 8601 date (eg. 2024-08-01)',
    "--studentsOnly Only include users whose primary occupation is 'university' or 'school'",
    '--includeSignUpDate Include signUpDate column',
    '--includeCountry Include countryCode column (inferred from institution and possibly missing)',
    '--includePlanCode Include planCode column',
    '',
  ]
  console.log(helpLines.join('\n'))
}
/**
 * Parse this script's command-line flags with minimist.
 *
 * Side effects: prints the usage text and exits with status 0 when --help is
 * passed; prints an error plus the usage text and exits with status 1 when
 * --registeredBefore cannot be parsed as a date.
 *
 * @returns {object} the minimist argv object, with `help`, `studentsOnly`,
 *   `includeSignUpDate`, `includeCountry` and `includePlanCode` as booleans
 *   and `registeredBefore` as a string (default '2024-02-18')
 */
function parseArgs() {
  const argv = minimist(process.argv.slice(2), {
    string: ['registeredBefore'],
    // minimist's option key is `boolean`; a `bool` key is ignored, which
    // would leave e.g. `--studentsOnly=false` as the truthy string 'false'.
    boolean: [
      'help',
      'studentsOnly',
      'includeSignUpDate',
      'includeCountry',
      'includePlanCode',
    ],
    default: {
      help: false,
      studentsOnly: false,
      includeSignUpDate: false,
      includeCountry: false,
      includePlanCode: false,
      registeredBefore: '2024-02-18',
    },
  })
  if (argv.help) {
    usage()
    process.exit(0)
  }
  // Fail early on an unparseable date instead of passing an Invalid Date
  // into the database query later on.
  if (Number.isNaN(new Date(argv.registeredBefore).getTime())) {
    console.error(`Invalid --registeredBefore date: ${argv.registeredBefore}`)
    usage()
    process.exit(1)
  }
  return argv
}
/**
 * Fetch email and sign-up date for the given users, keeping only users whose
 * signUpDate is on or before `registeredBefore`.
 *
 * The result may be shorter than `userIds` and is in query order, not input
 * order, so each row carries its `userId` to let callers join other per-user
 * data back onto it.
 *
 * @param {string[]} userIds
 * @param {{registeredBefore: string}} options
 * @returns {Promise<Array<{userId: string, email: string, signUpDate: string}>>}
 */
async function getEmails(userIds, { registeredBefore }) {
  const matchingUsers = await User.find(
    { _id: { $in: userIds }, signUpDate: { $lte: new Date(registeredBefore) } },
    { email: 1, signUpDate: 1 }
  ).exec()
  return matchingUsers.map(({ _id, email, signUpDate }) => ({
    userId: String(_id),
    email,
    signUpDate: new Date(signUpDate).toISOString(),
  }))
}

/**
 * Collect ids and institution names from onboarding records with
 * usedLatex = 'never'.
 *
 * @param {{studentsOnly: boolean}} options
 * @returns {Promise<{userIds: string[], institutionNames: Array}>} two
 *   parallel arrays: institutionNames[i] belongs to userIds[i]
 */
async function getUsers({ studentsOnly }) {
  const odcCriteria = { usedLatex: 'never' }
  if (studentsOnly) {
    // NOTE(review): the --studentsOnly help text says 'university' or
    // 'school', but only 'university' is matched here - confirm which one is
    // intended.
    odcCriteria.primaryOccupation = 'university'
  }
  const cursor = OnboardingDataCollection.find(odcCriteria).cursor()
  const userIds = []
  const institutionNames = []
  for (let doc = await cursor.next(); doc != null; doc = await cursor.next()) {
    userIds.push(doc._id.toString())
    institutionNames.push(doc.institutionName)
  }
  return { userIds, institutionNames }
}

/**
 * Look up the subscription plan code for each entry of `users`, one at a
 * time.
 *
 * @param {Array} users - values passed straight to
 *   SubscriptionLocator.promises.getUsersSubscription (runScript passes user
 *   id strings)
 * @returns {Promise<string[]>} plan codes in the same order as `users`;
 *   'free' when no subscription or plan code is found
 */
async function getUserPlanCodes(users) {
  const planCodes = []
  for (const user of users) {
    const subscription =
      await SubscriptionLocator.promises.getUsersSubscription(user)
    planCodes.push(subscription?.planCode || 'free')
  }
  return planCodes
}

/**
 * Infer a country code for each institution name via the institutions search
 * endpoint. Inferred from the institution, so it will not always be
 * available or accurate.
 *
 * @param {Array<string|undefined>} institutions
 * @returns {Promise<Array<string|undefined>>} country codes in the same order
 *   as `institutions`; undefined when the name is empty, nothing was found or
 *   the lookup failed
 */
async function getUserCountries(institutions) {
  const countryCodes = []
  // Cache completed lookups (including "not found") to avoid duplicate
  // calls. A Map is used because names are user-supplied and could collide
  // with Object.prototype keys such as 'constructor'.
  const institutionLookups = new Map()
  for (const inst of institutions) {
    if (!inst) {
      countryCodes.push(undefined)
      continue
    }
    if (institutionLookups.has(inst)) {
      countryCodes.push(institutionLookups.get(inst))
      continue
    }
    try {
      const url = `${Settings.apis.web.url}/institutions/search?search=${encodeURIComponent(inst)}&max_results=1`
      const response = await fetchJson(url)
      const countryCode = response[0]?.country_code
      institutionLookups.set(inst, countryCode)
      countryCodes.push(countryCode)
    } catch (e) {
      // if institution search fails just move on; the failure is not cached
      // so a later occurrence of the same name is retried
      console.log(`Error when looking up institution ${inst}: ${e.message}`)
      countryCodes.push(undefined)
    }
  }
  return countryCodes
}

/**
 * Build the export rows and write them to /tmp/odc_neverUsedLatex.csv, then
 * exit the process (status 1 if CSV serialisation fails).
 */
async function runScript() {
  const args = parseArgs()
  const columns = ['email']
  if (args.includeSignUpDate) {
    columns.push('signUpDate')
  }

  const { userIds, institutionNames } = await getUsers(args)
  const institutionByUserId = new Map(
    userIds.map((userId, index) => [userId, institutionNames[index]])
  )

  // getEmails drops users registered after the cut-off and does not preserve
  // the order of userIds, so the extra columns are joined on userId rather
  // than on array position.
  const rows = await getEmails(userIds, args)

  if (args.includePlanCode) {
    columns.push('planCode')
    const planCodes = await getUserPlanCodes(rows.map(row => row.userId))
    rows.forEach((row, index) => {
      row.planCode = planCodes[index]
    })
  }

  if (args.includeCountry) {
    // NOTE(review): the help text calls this the "countryCode column" but
    // the CSV header is 'country'; left as-is so existing output is stable.
    columns.push('country')
    const countryCodes = await getUserCountries(
      rows.map(row => institutionByUserId.get(row.userId))
    )
    rows.forEach((row, index) => {
      row.country = countryCodes[index]
    })
  }

  console.log('Starting to write to csv file...')
  // Only the fields listed in `columns` are passed as CSV columns, so the
  // helper `userId` field on each row is not part of the requested output.
  csv.stringify(rows, { header: true, columns }, (err, output) => {
    if (err) {
      // check the error first: `output` is not usable when stringify failed
      console.error('error writing csv output: ', err)
      process.exit(1)
    } else {
      fs.writeFileSync('/tmp/odc_neverUsedLatex.csv', output)
      process.exit()
    }
  })
}
// Entry point: run the extraction; any rejection that reaches this point is
// printed to stderr and the process exits with status 1.
runScript().catch(function reportFatalError(error) {
  console.error(error)
  process.exit(1)
})