From a3c27f16e7b3c4b28fd92aa7c33e7c5f313fef2e Mon Sep 17 00:00:00 2001
From: h7x4
Date: Fri, 6 Feb 2026 16:54:28 +0900
Subject: [PATCH] More filtering, reuse `find` results

---
 run.sh | 233 +++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 160 insertions(+), 73 deletions(-)

diff --git a/run.sh b/run.sh
index c1de822..17dbc00 100755
--- a/run.sh
+++ b/run.sh
@@ -1,41 +1,127 @@
 #!/usr/bin/env bash
 
 OUTPUT_DIR="out"
-
 mkdir -p "$OUTPUT_DIR"
 
+# TODO: all paths in cgi-bin should be considered a cgi-script, independent of file extension
+
+echo "Starting webdoc analysis..."
+
 if [[ ! -f "$OUTPUT_DIR/cgi-paths.txt" ]]; then
-  find \
-    /home/pvv/?/*/web-docs \
-    -type f \
-    \( \
-      -name '*.cgi' \
-      -o -name '*.php' \
-      -o -name '*.php3' \
-      -o -name '*.pm' \
-      -o -name '*.pl' \
-      -o -name '*.sh' \
-      -o -name '*.bash' \
-      -o -name '*.phtml' \
-      -o -name '*.shtml' \
-      -o -name '*.lisp' \
-      -o -name '*.cl' \
-    \) \
-    2>/dev/null \
-    | tee "$OUTPUT_DIR/cgi-paths.txt"
+  echo "Searching for CGI scripts..."
+
+  EXCLUDE_LOCATIONS=( # directory trees that are pruned from the search
+    "*/.bundle/*"
+    "*/.bzr/*"
+    "*/.cache/*"
+    "*/.cargo/*"
+    "*/.git/*"
+    "*/.hg/*"
+    "*/.npm/*"
+    "*/.nvm/*"
+    "*/.rvm/*"
+    "*/.svn/*"
+    "*/.tox/*"
+    "*/__pycache__/*"
+    "*/node_modules/*"
+    "*/vendor/*"
+  )
+
+  FILE_NAME_PATTERNS=( # file names treated as CGI scripts
+    "*.cgi"
+    "*.php"
+    "*.php[0-9]*"
+    "*.pm"
+    "*.pl"
+    "*.sh"
+    "*.bash"
+    "*.phtml"
+    "*.shtml"
+    "*.lisp"
+    "*.cl"
+  )
+
+  FILE_NAME_ANTI_PATTERNS=( # file names rejected even if a pattern above matches
+    "*.so"
+    "#*"
+    "*~"
+  )
+
+  FIND_ARGS=()
+  for d in /home/pvv/?/*/web-docs; do
+    if [[ -d "$d" ]]; then
+      FIND_ARGS+=("$d")
+    fi
+  done
+  [[ ${#FIND_ARGS[@]} -gt 0 ]] || { echo "No web-docs directories found" >&2; exit 1; } # GNU find would otherwise search "."
+
+  if [[ ${#EXCLUDE_LOCATIONS[@]} -gt 0 ]]; then
+    FIND_ARGS+=( \( )
+    for ((i=0; i<${#EXCLUDE_LOCATIONS[@]}; i++)); do
+      FIND_ARGS+=(-path "${EXCLUDE_LOCATIONS[i]}")
+      if [[ $i -lt $(( ${#EXCLUDE_LOCATIONS[@]} - 1 )) ]]; then
+        FIND_ARGS+=(-o)
+      fi
+    done
+    FIND_ARGS+=( \) -prune -o ) # excluded trees are pruned; everything else falls through
+  fi
+
+  FIND_ARGS+=(
+    "-type" "f"
+    "-executable"
+  )
+
+  FIND_ARGS+=(\( )
+  for ((i=0; i<${#FILE_NAME_PATTERNS[@]}; i++)); do
+    pattern="${FILE_NAME_PATTERNS[i]}"
+    if [[ $i -gt 0 ]]; then
+      FIND_ARGS+=(-o)
+    fi
+    FIND_ARGS+=(-name "$pattern")
+  done
+  FIND_ARGS+=(\) )
+
+  for anti_pattern in "${FILE_NAME_ANTI_PATTERNS[@]}"; do
+    FIND_ARGS+=(-not -name "$anti_pattern")
+  done
+
+  FIND_ARGS+=("-print") # explicit, so pruned directories themselves are not printed
+
+  echo "find \\"
+  for arg in "${FIND_ARGS[@]}"; do
+    echo " '$arg' \\"
+  done
+  echo ""
+
+  find "${FIND_ARGS[@]}" 2>/dev/null | tee "$OUTPUT_DIR/cgi-paths.txt"
+else
+  echo "'$OUTPUT_DIR/cgi-paths.txt' already exists, reusing..."
 fi
 
-if [[ ! -f out/perl-libs.txt ]]; then
-  find \
-    /home/pvv/?/*/web-docs \
-    -type f \
-    \( \
-      -name '*.pm' \
-      -o -name '*.pl' \
-    \) \
-    -exec rg '^use ([^;]+);' {} -N -o -r '$1' \; \
-    2>/dev/null \
-    | tee "$OUTPUT_DIR/perl-libs.txt"
+##########################
+# ANALYZE PERL LIBRARIES #
+##########################
+
+if [[ ! -f "$OUTPUT_DIR/perl-libs.txt" ]]; then
+  echo "Analyzing Perl libraries..."
+
+  : > "$OUTPUT_DIR/perl-libs.txt" # start from an empty result file
+  mapfile -t ALL_CGI_PATHS < "$OUTPUT_DIR/cgi-paths.txt"
+
+  PERL_PATHS=()
+  for p in "${ALL_CGI_PATHS[@]}"; do
+    case "$p" in
+      *.pm|*.pl)
+        PERL_PATHS+=("$p")
+        ;;
+    esac
+  done
+
+  for p in "${PERL_PATHS[@]}"; do
+    rg '^use ([^;]+);' --no-messages -N -o -r '$1' "$p" 2>/dev/null >> "$OUTPUT_DIR/perl-libs.txt" || true
+  done
+else
+  echo "'$OUTPUT_DIR/perl-libs.txt' already exists, reusing..."
 fi
 
 cat "$OUTPUT_DIR/perl-libs.txt" | sort > "$OUTPUT_DIR/perl-libs-tmp1.txt"
@@ -52,66 +138,67 @@ comm -2 -3 "$OUTPUT_DIR/perl-libs-tmp2.txt" <(cat PERL_STANDARD_MODULES.txt | so
 # Remove pragmas
 readarray -t PERL_PRAGMAS < PERL_PRAGMAS.txt
 remove_pragmas_regex=$(printf '|^%s' "${PERL_PRAGMAS[@]}")
-remove_pragmas_regex="${remove_pragmas_regex:1}" # remove leading '|
+remove_pragmas_regex="${remove_pragmas_regex:1}" # remove leading '|'
 sed -E "/${remove_pragmas_regex}/d" "$OUTPUT_DIR/perl-libs-tmp3.txt" > "$OUTPUT_DIR/perl-libs-filtered.txt"
 
 cat "$OUTPUT_DIR/perl-libs-filtered.txt" | uniq -c | sort -gr > "$OUTPUT_DIR/perl-libs-overview.txt"
 
+#########################
+# ANALYZE PHP LIBRARIES #
+#########################
+
 if [[ ! -f "$OUTPUT_DIR/php-libs.txt" ]]; then
-  find \
-    /home/pvv/?/*/web-docs \
-    -type f \
-    \( \
-      -name '*.php' \
-      -o -name '*.php3' \
-      -o -name '*.phtml' \
-    \) \
-    -exec rg '^use ([^;]+);' {} -N -o -r '$1' \; \
-    2>/dev/null \
-    | tee "$OUTPUT_DIR/php-libs.txt"
+  echo "Analyzing PHP libraries..."
+
+  : > "$OUTPUT_DIR/php-libs.txt" # start from an empty result file
+  mapfile -t ALL_CGI_PATHS < "$OUTPUT_DIR/cgi-paths.txt"
+
+  PHP_PATHS=()
+  for p in "${ALL_CGI_PATHS[@]}"; do
+    case "$p" in
+      *.php|*.php[0-9]*|*.phtml) # plain .php must be listed: *.php[0-9]* needs a digit
+        PHP_PATHS+=("$p")
+        ;;
+    esac
+  done
+
+  for p in "${PHP_PATHS[@]}"; do
+    rg '^use ([^;]+);' --no-messages -N -o -r '$1' "$p" 2>/dev/null >> "$OUTPUT_DIR/php-libs.txt" || true
+  done
+else
+  echo "'$OUTPUT_DIR/php-libs.txt' already exists, reusing..."
 fi
 
 cat "$OUTPUT_DIR/php-libs.txt" | sort | uniq -c | sort -gr > "$OUTPUT_DIR/php-libs-overview.txt"
 
+########################
+# ANALYZE CGI PROGRAMS #
+########################
+
 if [[ ! -f "$OUTPUT_DIR/cgi-progs.txt" ]]; then
+  echo "Analyzing CGI programs..."
+
+  : > "$OUTPUT_DIR/cgi-progs.txt" # start from an empty result file
+  mapfile -t ALL_CGI_PATHS < "$OUTPUT_DIR/cgi-paths.txt"
+
   readarray -t BIN_LOCATIONS < BIN_LOCATIONS.txt
   bin_locations_regex=$(printf '|%s' "${BIN_LOCATIONS[@]}")
   bin_locations_regex="${bin_locations_regex:1}" # remove leading '|'
   bin_locations_regex="(?:${bin_locations_regex})"
 
-  find \
-    /home/pvv/?/*/web-docs \
-    -type f \
-    \( \
-      -name '*.cgi' \
-      -o -name '*.php' \
-      -o -name '*.php3' \
-      -o -name '*.pm' \
-      -o -name '*.pl' \
-      -o -name '*.sh' \
-      -o -name '*.bash' \
-      -o -name '*.phtml' \
-      -o -name '*.shtml' \
-      -o -name '*.lisp' \
-      -o -name '*.cl' \
-    \) \
-    -exec rg "$bin_locations_regex/(?:env\s*)?(\w+(?:/\w+)*)" {} -N -o -r '$1' \; \
-    2>/dev/null \
-    | tee "$OUTPUT_DIR/cgi-progs.txt"
+  for p in "${ALL_CGI_PATHS[@]}"; do
+    rg "$bin_locations_regex/(?:env\\s*)?(\\w+(?:/\\w+)*)" --no-messages -N -o -r '$1' "$p" 2>/dev/null >> "$OUTPUT_DIR/cgi-progs.txt" || true
+  done
+
+  # TODO: extract non-absolute binary references from calls that invoke subprocesses in the various languages
+else
+  echo "'$OUTPUT_DIR/cgi-progs.txt' already exists, reusing..."
 fi
 
 cat "$OUTPUT_DIR/cgi-progs.txt" | sort | uniq -c | sort -gr > "$OUTPUT_DIR/cgi-progs-overview.txt"
 
+##########################
+# ANALYZE LISP LIBRARIES #
+##########################
+
 # TODO find lisp libraries
-
-# TODO: look through all files in ~/web-docs/cgi-bin (last one case insensitive)
-
-# TODO: generally filter out files ending with ~ or starting with #
-
-# TODO: generally filter out .git, RCS, and other such files
-
-# TODO: generally filter out executables, drop executable .so files
-
-# TODO: look for all digits behind '*.php' (i.e. asdf.php4 and asdf.php5)
-
-# TODO: extract non-absolute binary references from calls that invoke subprocesses in the various languages