#!/usr/bin/env bash
#
# Inventory of what the scripts below the users' web-docs directories use.
#
# Stages (each one caches its raw result in $OUTPUT_DIR and is skipped when
# that file already exists; delete the file to redo the stage):
#   1. cgi-paths.txt  - executable script files below /home/pvv/?/*/web-docs
#   2. perl-libs.txt  - `use ...;` statements found in the *.pm / *.pl files
#   3. php-libs.txt   - `use ...;` statements found in the PHP files
#   4. cgi-progs.txt  - programs referenced below the BIN_LOCATIONS.txt prefixes
# The *-overview.txt files hold occurrence counts, most frequent first.
#
# Read from the current directory, one entry per line:
#   PERL_STANDARD_MODULES.txt, PERL_PRAGMAS.txt, BIN_LOCATIONS.txt
# External tools: find (with -executable), rg, sort, sed, comm, uniq, tee.
#
# The script is best-effort on purpose: missing inputs or tools produce a
# warning on stderr and an empty/unfiltered result instead of aborting.

OUTPUT_DIR="out"
mkdir -p "$OUTPUT_DIR"

# Print a warning made of all arguments on stderr.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

# Succeed when the file name of path $1 ends in .pm or .pl.
is_perl_script() {
  case "${1##*/}" in
    *.pm | *.pl) return 0 ;;
    *) return 1 ;;
  esac
}

# Succeed when the file name of path $1 looks like PHP: .php, .php<digit>...
# or .phtml. Only the last path component is inspected so that a directory
# name such as "site.php5" cannot make an unrelated file match.
is_php_script() {
  case "${1##*/}" in
    *.php | *.php[0-9]* | *.phtml) return 0 ;;
    *) return 1 ;;
  esac
}

# Print the non-blank lines of file $1.
# Returns 1 (after a warning, printing nothing) when the file is unreadable.
nonblank_lines() {
  local file=$1
  if [[ ! -r "$file" ]]; then
    warn "cannot read '$file'"
    return 1
  fi
  # grep exits 1 when it selects no line; that is not an error here.
  grep -v '^[[:space:]]*$' -- "$file" || true
}

# Append "( TEST p1 -o TEST p2 ... )" to the global FIND_ARGS array.
# Arguments: $1 - find test (e.g. -name or -path), $2... - its patterns.
append_any_of() {
  local find_test=$1
  shift
  local pattern
  local first=1
  FIND_ARGS+=('(')
  for pattern in "$@"; do
    if ((first)); then
      first=0
    else
      FIND_ARGS+=(-o)
    fi
    FIND_ARGS+=("$find_test" "$pattern")
  done
  FIND_ARGS+=(')')
}

# Print capture group 1 of every match of rg pattern $1 in the files $2...
# One rg run per file; failures (unreadable file, no match, rg missing) are
# deliberately ignored so a single bad file cannot stop the scan.
extract_first_group() {
  local pattern=$1
  shift
  local file
  for file in "$@"; do
    rg "$pattern" --no-messages -N -o -r '$1' "$file" 2>/dev/null || true
  done
}

# TODO: all paths in cgi-bin should be considered a cgi-script, independent of file extension

echo "Starting webdoc analysis..."

if ! command -v rg >/dev/null 2>&1; then
  warn "'rg' (ripgrep) not found in PATH; stages that are not cached yet will yield empty results"
fi

if [[ ! -f "$OUTPUT_DIR/cgi-paths.txt" ]]; then
  echo "Searching for CGI scripts..."

  # Directory trees that are pruned from the search.
  EXCLUDE_LOCATIONS=(
    "*/.bundle/*"
    "*/.bzr/*"
    "*/.cache/*"
    "*/.cargo/*"
    "*/.git/*"
    "*/.hg/*"
    "*/.npm/*"
    "*/.nvm/*"
    "*/.rvm/*"
    "*/.svn/*"
    "*/.tox/*"
    "*/__pycache__/*"
    "*/node_modules/*"
    "*/vendor/*"
  )

  # File names that count as a script ...
  FILE_NAME_PATTERNS=(
    "*.cgi"
    "*.php"
    "*.php[0-9]*"
    "*.pm"
    "*.pl"
    "*.sh"
    "*.bash"
    "*.phtml"
    "*.shtml"
    "*.lisp"
    "*.cl"
  )

  # ... unless they also match one of these.
  FILE_NAME_ANTI_PATTERNS=(
    "*.so"
    "#*"
    "*~"
  )

  SEARCH_ROOTS=()
  for d in /home/pvv/?/*/web-docs; do
    if [[ -d "$d" ]]; then
      SEARCH_ROOTS+=("$d")
    fi
  done

  if ((${#SEARCH_ROOTS[@]} == 0)); then
    # Without a starting point find would fall back to scanning ".", which
    # would fill the listing with unrelated files. Record an empty listing
    # instead so the later stages still have a file to read.
    warn "no web-docs directory matches /home/pvv/?/*/web-docs;" \
      "writing an empty '$OUTPUT_DIR/cgi-paths.txt' (delete it to retry)"
    : >"$OUTPUT_DIR/cgi-paths.txt"
  else
    FIND_ARGS=("${SEARCH_ROOTS[@]}")

    if [[ ${#EXCLUDE_LOCATIONS[@]} -gt 0 ]]; then
      append_any_of -path "${EXCLUDE_LOCATIONS[@]}"
      FIND_ARGS+=(-prune -o)
    fi

    FIND_ARGS+=("-type" "f" "-executable")

    append_any_of -name "${FILE_NAME_PATTERNS[@]}"

    for anti_pattern in "${FILE_NAME_ANTI_PATTERNS[@]}"; do
      FIND_ARGS+=(-not -name "$anti_pattern")
    done

    FIND_ARGS+=("-print")

    # Show the command that is about to run.
    echo "find \\"
    for arg in "${FIND_ARGS[@]}"; do
      echo " '$arg' \\"
    done
    echo ""

    # find's own diagnostics are discarded (presumably permission errors in
    # other users' directories); the listing is shown and stored.
    find "${FIND_ARGS[@]}" 2>/dev/null | tee "$OUTPUT_DIR/cgi-paths.txt"
  fi
else
  echo "'$OUTPUT_DIR/cgi-paths.txt' already exists, reusing..."
fi

##########################
# ANALYZE PERL LIBRARIES #
##########################

if [[ ! -f "$OUTPUT_DIR/perl-libs.txt" ]]; then
  echo "Analyzing Perl libraries..."

  : >"$OUTPUT_DIR/perl-libs.txt"

  mapfile -t ALL_CGI_PATHS <"$OUTPUT_DIR/cgi-paths.txt"
  PERL_PATHS=()
  for p in "${ALL_CGI_PATHS[@]}"; do
    if is_perl_script "$p"; then
      PERL_PATHS+=("$p")
    fi
  done

  extract_first_group '^use ([^;]+);' "${PERL_PATHS[@]}" >>"$OUTPUT_DIR/perl-libs.txt"
else
  echo "'$OUTPUT_DIR/perl-libs.txt' already exists, reusing..."
fi

sort "$OUTPUT_DIR/perl-libs.txt" >"$OUTPUT_DIR/perl-libs-tmp1.txt"

# Remove import specifiers
sed -E \
  -e 's|\s*(qw)?\s*\(.*\)||g' \
  -e 's|\s*(qw)?\s*\/.*\/||g' \
  "$OUTPUT_DIR/perl-libs-tmp1.txt" | sort >"$OUTPUT_DIR/perl-libs-tmp2.txt"

# Remove standard Perl modules (comm needs both inputs sorted the same way).
# With an unreadable list nothing is removed.
comm -2 -3 "$OUTPUT_DIR/perl-libs-tmp2.txt" \
  <(nonblank_lines PERL_STANDARD_MODULES.txt | sort) \
  >"$OUTPUT_DIR/perl-libs-tmp3.txt"

# Remove pragmas
mapfile -t PERL_PRAGMAS < <(nonblank_lines PERL_PRAGMAS.txt)
if ((${#PERL_PRAGMAS[@]} > 0)); then
  # NOTE(review): this is a prefix match, so a pragma "lib" also drops a
  # module whose name merely starts with "lib" - confirm that is intended.
  remove_pragmas_regex=$(printf '|^%s' "${PERL_PRAGMAS[@]}")
  remove_pragmas_regex="${remove_pragmas_regex:1}" # remove leading '|'
  sed -E "/${remove_pragmas_regex}/d" "$OUTPUT_DIR/perl-libs-tmp3.txt" \
    >"$OUTPUT_DIR/perl-libs-filtered.txt"
else
  # An empty pragma list would turn the pattern into a bare "^" and delete
  # every line, so pass the data through unfiltered instead.
  warn "no Perl pragmas loaded from PERL_PRAGMAS.txt; pragmas are not filtered out"
  cat "$OUTPUT_DIR/perl-libs-tmp3.txt" >"$OUTPUT_DIR/perl-libs-filtered.txt"
fi

# The filtered list is still sorted, so uniq can count it directly.
uniq -c "$OUTPUT_DIR/perl-libs-filtered.txt" | sort -gr >"$OUTPUT_DIR/perl-libs-overview.txt"

#########################
# ANALYZE PHP LIBRARIES #
#########################

if [[ ! -f "$OUTPUT_DIR/php-libs.txt" ]]; then
  echo "Analyzing PHP libraries..."

  : >"$OUTPUT_DIR/php-libs.txt"

  mapfile -t ALL_CGI_PATHS <"$OUTPUT_DIR/cgi-paths.txt"
  PHP_PATHS=()
  for p in "${ALL_CGI_PATHS[@]}"; do
    if is_php_script "$p"; then
      PHP_PATHS+=("$p")
    fi
  done

  extract_first_group '^use ([^;]+);' "${PHP_PATHS[@]}" >>"$OUTPUT_DIR/php-libs.txt"
else
  echo "'$OUTPUT_DIR/php-libs.txt' already exists, reusing..."
fi

sort "$OUTPUT_DIR/php-libs.txt" | uniq -c | sort -gr >"$OUTPUT_DIR/php-libs-overview.txt"

########################
# ANALYZE CGI PROGRAMS #
########################

if [[ ! -f "$OUTPUT_DIR/cgi-progs.txt" ]]; then
  echo "Analyzing CGI programs..."

  : >"$OUTPUT_DIR/cgi-progs.txt"

  mapfile -t ALL_CGI_PATHS <"$OUTPUT_DIR/cgi-paths.txt"
  mapfile -t BIN_LOCATIONS < <(nonblank_lines BIN_LOCATIONS.txt)

  if ((${#BIN_LOCATIONS[@]} == 0)); then
    # An empty alternation would match any "/word" in every file.
    warn "no locations loaded from BIN_LOCATIONS.txt;" \
      "leaving '$OUTPUT_DIR/cgi-progs.txt' empty (delete it to retry)"
  else
    bin_locations_regex=$(printf '|%s' "${BIN_LOCATIONS[@]}")
    bin_locations_regex="${bin_locations_regex:1}" # remove leading '|'
    bin_locations_regex="(?:${bin_locations_regex})"

    # Captures what follows "<location>/", skipping an optional "env ".
    extract_first_group "$bin_locations_regex/(?:env\\s*)?(\\w+(?:/\\w+)*)" \
      "${ALL_CGI_PATHS[@]}" >>"$OUTPUT_DIR/cgi-progs.txt"
  fi

  # TODO: extract non-absolute binary references from calls that invoke subprocesses in the various languages
else
  echo "'$OUTPUT_DIR/cgi-progs.txt' already exists, reusing..."
fi

sort "$OUTPUT_DIR/cgi-progs.txt" | uniq -c | sort -gr >"$OUTPUT_DIR/cgi-progs-overview.txt"

##########################
# ANALYZE LISP LIBRARIES #
##########################

# TODO find lisp libraries