Files
delete-your-element/scripts/text-probability.js
2025-11-15 16:41:02 +13:00

66 lines
2.3 KiB
JavaScript

// @ts-check
const Ty = require("../src/types")
const fs = require("fs")
const domino = require("domino")
const repl = require("repl")
/**
 * Text content of every <pre> element that is at least 100 characters long, gathered from the
 * HTML-formatted m.text messages in the JSON files named on the command line.
 */
const pres = (() => {
	const minimumLength = 100

	/**
	 * Parse an HTML fragment and return the text of each <pre> element that is long enough.
	 * @param {string} html
	 * @returns {string[]}
	 */
	function longPreTexts(html) {
		const document = domino.createDocument(html)
		// @ts-ignore
		const preElements = [...document.querySelectorAll("pre").cache]
		return preElements.map(element => element.textContent).filter(text => text.length >= minimumLength)
	}

	/** @type {string[]} */
	const collected = []
	for (const path of process.argv.slice(2)) {
		const exported = JSON.parse(fs.readFileSync(path, "utf8"))
		/** @type {Ty.Event.Outer<{msgtype?: string}>[]} */
		const events = exported.messages
		for (const event of events) {
			// only plain text messages are considered
			const isTextMessage = event.type === "m.room.message" && event.content.msgtype === "m.text"
			if (!isTextMessage) continue
			/** @type {Ty.Event.M_Room_Message} */ // @ts-ignore
			const message = event.content
			// ...and only those that carry a non-empty HTML body
			const hasHtmlBody = message.format === "org.matrix.custom.html" && Boolean(message.formatted_body)
			if (!hasHtmlBody) continue
			for (const text of longPreTexts(message.formatted_body)) {
				collected.push(text)
			}
		}
	}
	return collected
})()
// Request a garbage collection pass now that loading has finished (presumably to release the
// parsed JSON and DOM documents before the REPL starts).
// `gc` is only present on the global object when Node is started with --expose-gc, so the
// call is optional: without the flag the script carries on instead of throwing a TypeError.
// @ts-ignore
global.gc?.()
/** @param {string} text */
function probablyFixedWidthIntended(text) {
// if internal spaces are used, seems like they want a fixed-width font
if (text.match(/[^ ] {3,}[^ ]/)) return true
// if characters from Unicode General_Category "Symbol, other" are used, seems like they're doing ascii art and they want a fixed-width font
if (text.match(/\p{So}/v)) return true
// check start of line indentation
let indents = new Set()
for (const line of text.trimEnd().split("\n")) {
indents.add(line.match(/^ */)?.[0].length || 0)
// if there are more than 3 different indents (counting 0) then it's code
if (indents.size >= 3) return true
}
// if everything is indented then it's code
if (!indents.has(0)) return true
// if there is a high proportion of symbols then it's code (this filter works remarkably well on its own)
if ([...text.matchAll(/[\\`~;+|<>%$@*&"'=(){}[\]_^]|\.[a-zA-Z]|[a-z][A-Z]/g)].length / text.length >= 0.04) return true
return false
}
// Start an interactive REPL and attach the collected samples and the heuristic to its context
{
	const {context} = repl.start()
	context.pres = pres
	context.probablyFixedWidthIntended = probablyFixedWidthIntended
}
/*
if it has a lot of symbols then it's code
if it has >=3 levels of indentation then it's code
if it is all indented then it's code
if it has many spaces in a row in the middle then it's ascii art
if it has many non-latin characters then it's language
-> except if they are ascii art characters e.g. ⣿⣿⡇⢸⣿⠃ then it's ascii art
*/