More filtering, reuse find results

This commit is contained in:
2026-02-06 16:54:28 +09:00
parent 198d756de5
commit a3c27f16e7

233
run.sh
View File

@@ -1,41 +1,127 @@
#!/usr/bin/env bash
OUTPUT_DIR="out"
mkdir -p "$OUTPUT_DIR"
# TODO: all paths in cgi-bin should be considered a cgi-script, independent of file extension
echo "Starting webdoc analysis..."
if [[ ! -f "$OUTPUT_DIR/cgi-paths.txt" ]]; then
find \
/home/pvv/?/*/web-docs \
-type f \
\( \
-name '*.cgi' \
-o -name '*.php' \
-o -name '*.php3' \
-o -name '*.pm' \
-o -name '*.pl' \
-o -name '*.sh' \
-o -name '*.bash' \
-o -name '*.phtml' \
-o -name '*.shtml' \
-o -name '*.lisp' \
-o -name '*.cl' \
\) \
2>/dev/null \
| tee "$OUTPUT_DIR/cgi-paths.txt"
echo "Searching for CGI scripts..."
# Directory names whose whole subtree is skipped during the search.
# Each name is wrapped as "*/<name>/*" so it can be used as a `find -path`
# glob that matches anything located below such a directory.
EXCLUDE_LOCATIONS=()
for excluded_dir in \
  .bundle .bzr .cache .cargo .git .hg .npm .nvm .rvm .svn .tox \
  __pycache__ node_modules vendor; do
  EXCLUDE_LOCATIONS+=("*/${excluded_dir}/*")
done

# File name globs (for `find -name`) that mark a file as a script candidate.
# 'php[0-9]*' covers versioned extensions such as .php3 or .php5.
FILE_NAME_PATTERNS=()
for extension in cgi php 'php[0-9]*' pm pl sh bash phtml shtml lisp cl; do
  FILE_NAME_PATTERNS+=("*.${extension}")
done

# File name globs that disqualify a file even if it matched a pattern above:
# shared objects, '#'-prefixed names and '~'-suffixed names.
FILE_NAME_ANTI_PATTERNS=("*.so" "#*" "*~")
# Assemble the argument vector for a single `find` run and execute it.
#
# Resulting layout:
#   <search roots>
#   ( -path <excl> -o -path <excl> ... ) -prune -o
#   -type f -executable
#   ( -name <pat> -o -name <pat> ... )
#   -not -name <anti> ...
#   -print
#
# Reads:  EXCLUDE_LOCATIONS, FILE_NAME_PATTERNS, FILE_NAME_ANTI_PATTERNS,
#         OUTPUT_DIR
# Writes: "$OUTPUT_DIR/cgi-paths.txt" (also echoed to stdout via tee)
FIND_ARGS=()
for d in /home/pvv/?/*/web-docs; do
  # An unmatched glob is left as a literal string, so keep real directories only.
  if [[ -d "$d" ]]; then
    FIND_ARGS+=("$d")
  fi
done
# Remember how many starting points were found before any expression
# arguments are appended to the same array.
search_root_count=${#FIND_ARGS[@]}

if [[ ${#EXCLUDE_LOCATIONS[@]} -gt 0 ]]; then
  FIND_ARGS+=( \( )
  for ((i=0; i<${#EXCLUDE_LOCATIONS[@]}; i++)); do
    if [[ $i -gt 0 ]]; then
      FIND_ARGS+=(-o)
    fi
    FIND_ARGS+=(-path "${EXCLUDE_LOCATIONS[i]}")
  done
  FIND_ARGS+=( \) -prune -o )
fi

FIND_ARGS+=(
  "-type" "f"
  "-executable"
)

FIND_ARGS+=( \( )
for ((i=0; i<${#FILE_NAME_PATTERNS[@]}; i++)); do
  pattern="${FILE_NAME_PATTERNS[i]}"
  if [[ $i -gt 0 ]]; then
    FIND_ARGS+=(-o)
  fi
  FIND_ARGS+=(-name "$pattern")
done
FIND_ARGS+=( \) )

for anti_pattern in "${FILE_NAME_ANTI_PATTERNS[@]}"; do
  FIND_ARGS+=(-not -name "$anti_pattern")
done

FIND_ARGS+=("-print")

if [[ $search_root_count -eq 0 ]]; then
  # Without a starting point GNU find searches the current directory, which
  # would fill the result file with unrelated paths. Record an empty result
  # instead, which is what a search over no directories should yield.
  echo "No web-docs directories found under /home/pvv, skipping search." >&2
  : > "$OUTPUT_DIR/cgi-paths.txt"
else
  # Print the full command for traceability before running it.
  echo "find \\"
  for arg in "${FIND_ARGS[@]}"; do
    echo " '$arg' \\"
  done
  echo ""

  # Errors from find are discarded (2>/dev/null); results go to both the
  # terminal and the cache file.
  find "${FIND_ARGS[@]}" 2>/dev/null | tee "$OUTPUT_DIR/cgi-paths.txt"
fi
else
echo "'$OUTPUT_DIR/cgi-paths.txt' already exists, reusing..."
fi
if [[ ! -f out/perl-libs.txt ]]; then
find \
/home/pvv/?/*/web-docs \
-type f \
\( \
-name '*.pm' \
-o -name '*.pl' \
\) \
-exec rg '^use ([^;]+);' {} -N -o -r '$1' \; \
2>/dev/null \
| tee "$OUTPUT_DIR/perl-libs.txt"
##########################
# ANALYZE PERL LIBRARIES #
##########################
if [[ ! -f "$OUTPUT_DIR/perl-libs.txt" ]]; then
echo "Analyzing Perl libraries..."
: > "$OUTPUT_DIR/perl-libs.txt"
mapfile -t ALL_CGI_PATHS < "$OUTPUT_DIR/cgi-paths.txt"
# Select the Perl sources (*.pm and *.pl) out of the paths previously loaded
# into ALL_CGI_PATHS.
PERL_PATHS=()
for candidate in "${ALL_CGI_PATHS[@]}"; do
  if [[ "$candidate" == *.pm || "$candidate" == *.pl ]]; then
    PERL_PATHS+=("$candidate")
  fi
done
for p in "${PERL_PATHS[@]}"; do
rg '^use ([^;]+);' --no-messages -N -o -r '$1' "$p" 2>/dev/null >> "$OUTPUT_DIR/perl-libs.txt" || true
done
else
echo "'$OUTPUT_DIR/perl-libs.txt' already exists, reusing..."
fi
cat "$OUTPUT_DIR/perl-libs.txt" | sort > "$OUTPUT_DIR/perl-libs-tmp1.txt"
@@ -52,66 +138,67 @@ comm -2 -3 "$OUTPUT_DIR/perl-libs-tmp2.txt" <(cat PERL_STANDARD_MODULES.txt | so
# Remove pragmas
readarray -t PERL_PRAGMAS < PERL_PRAGMAS.txt
remove_pragmas_regex=$(printf '|^%s' "${PERL_PRAGMAS[@]}")
remove_pragmas_regex="${remove_pragmas_regex:1}" # remove leading '|
remove_pragmas_regex="${remove_pragmas_regex:1}" # remove leading '|'
sed -E "/${remove_pragmas_regex}/d" "$OUTPUT_DIR/perl-libs-tmp3.txt" > "$OUTPUT_DIR/perl-libs-filtered.txt"
cat "$OUTPUT_DIR/perl-libs-filtered.txt" | uniq -c | sort -gr > "$OUTPUT_DIR/perl-libs-overview.txt"
#########################
# ANALYZE PHP LIBRARIES #
#########################
if [[ ! -f "$OUTPUT_DIR/php-libs.txt" ]]; then
find \
/home/pvv/?/*/web-docs \
-type f \
\( \
-name '*.php' \
-o -name '*.php3' \
-o -name '*.phtml' \
\) \
-exec rg '^use ([^;]+);' {} -N -o -r '$1' \; \
2>/dev/null \
| tee "$OUTPUT_DIR/php-libs.txt"
echo "Analyzing PHP libraries..."
: > "$OUTPUT_DIR/php-libs.txt"
mapfile -t ALL_CGI_PATHS < "$OUTPUT_DIR/cgi-paths.txt"
# is_php_path PATH
# Succeeds (status 0) when PATH ends like a PHP source file:
#   *.php          plain PHP
#   *.php[0-9]*    versioned extensions such as .php3 / .php5
#   *.phtml
# Returns 1 otherwise.
is_php_path() {
  case "$1" in
    # '*.php[0-9]*' alone requires a digit after ".php" and therefore does
    # not match ordinary ".php" files, so '*.php' is listed explicitly.
    *.php|*.php[0-9]*|*.phtml)
      return 0
      ;;
  esac
  return 1
}

# Select the PHP sources out of the paths previously loaded into ALL_CGI_PATHS.
PHP_PATHS=()
for p in "${ALL_CGI_PATHS[@]}"; do
  if is_php_path "$p"; then
    PHP_PATHS+=("$p")
  fi
done
for p in "${PHP_PATHS[@]}"; do
rg '^use ([^;]+);' --no-messages -N -o -r '$1' "$p" 2>/dev/null >> "$OUTPUT_DIR/php-libs.txt" || true
done
else
echo "'$OUTPUT_DIR/php-libs.txt' already exists, reusing..."
fi
cat "$OUTPUT_DIR/php-libs.txt" | sort | uniq -c | sort -gr > "$OUTPUT_DIR/php-libs-overview.txt"
########################
# ANALYZE CGI PROGRAMS #
########################
if [[ ! -f "$OUTPUT_DIR/cgi-progs.txt" ]]; then
echo "Analyzing CGI programs..."
: > "$OUTPUT_DIR/cgi-progs.txt"
mapfile -t ALL_CGI_PATHS < "$OUTPUT_DIR/cgi-paths.txt"
# Read BIN_LOCATIONS.txt (one entry per line; presumably directory prefixes
# such as /usr/bin - confirm against the data file) and join the entries into
# a single non-capturing regex alternation: (?:entry1|entry2|...).
mapfile -t BIN_LOCATIONS < BIN_LOCATIONS.txt
# Prefix every entry with '|', then drop the one leading separator.
printf -v joined_locations '|%s' "${BIN_LOCATIONS[@]}"
bin_locations_regex="(?:${joined_locations#|})"
find \
/home/pvv/?/*/web-docs \
-type f \
\( \
-name '*.cgi' \
-o -name '*.php' \
-o -name '*.php3' \
-o -name '*.pm' \
-o -name '*.pl' \
-o -name '*.sh' \
-o -name '*.bash' \
-o -name '*.phtml' \
-o -name '*.shtml' \
-o -name '*.lisp' \
-o -name '*.cl' \
\) \
-exec rg "$bin_locations_regex/(?:env\s*)?(\w+(?:/\w+)*)" {} -N -o -r '$1' \; \
2>/dev/null \
| tee "$OUTPUT_DIR/cgi-progs.txt"
for p in "${ALL_CGI_PATHS[@]}"; do
rg "$bin_locations_regex/(?:env\\s*)?(\\w+(?:/\\w+)*)" --no-messages -N -o -r '$1' "$p" 2>/dev/null >> "$OUTPUT_DIR/cgi-progs.txt" || true
done
# TODO: extract non-absolute binary references from calls that invoke subprocesses in the various languages
else
echo "'$OUTPUT_DIR/cgi-progs.txt' already exists, reusing..."
fi
cat "$OUTPUT_DIR/cgi-progs.txt" | sort | uniq -c | sort -gr > "$OUTPUT_DIR/cgi-progs-overview.txt"
##########################
# ANALYZE LISP LIBRARIES #
##########################
# TODO: find Lisp libraries
# TODO: look through all files in ~/web-docs/cgi-bin (last one case insensitive)
# TODO: generally filter out files ending with ~ or starting with #
# TODO: generally filter out .git, RCS, and other such files
# TODO: generally filter out executables, drop executable .so files
# TODO: look for all digits behind '*.php' (i.e. asdf.php4 and asdf.php5)
# TODO: extract non-absolute binary references from calls that invoke subprocesses in the various languages