#!/bin/sh
# Locate invalid characters in HTML mark-up.
# /www/public_html/admin/janitor/charset.sh
# Path of the file to scan (first positional argument).
IN_FILE=$1
# Text file whose characters define the set accepted in mark-up.
CHARSET_TEXT=/www/public_html/admin/janitor/charset.txt
# Derived list built from CHARSET_TEXT: one od(1) character name per line.
CHARSET_LIST=/www/public_html/admin/janitor/charset.lst
# Updated once a day by root's cron job.
HTML_LIST=/tmp/htm__www_public_html__html.lst

# Report a missing file on stderr and abort with a failure status, so that
# callers can tell an error from a clean run and the report stays unpolluted.
file_not_found() {
  echo "File not found: $1" >&2
  exit 1
}

[ -f "$CHARSET_TEXT" ] || file_not_found "$CHARSET_TEXT"
[ -f "$HTML_LIST" ] || file_not_found "$HTML_LIST"

# Use this .lst (list) file to locate invalid characters in mark-up:
#if [ $HTML_LIST -ot $CHARSET_LIST ]; then
# The explicit existence test matters: not every sh evaluates `-ot` as true
# when the left-hand file is missing, which would leave the list uncreated.
# NOTE(review): freshness is judged against HTML_LIST rather than CHARSET_TEXT,
# presumably to rebuild once per daily cron run -- confirm.
if [ ! -f "$CHARSET_LIST" ] || [ "$CHARSET_LIST" -ot "$HTML_LIST" ]; then
  # Column 1 of each od line is the offset; every later column is a character
  # name. Selecting by position keeps offsets and od's "*" repeat marker (a
  # one-field line) out of the list without relying on fmt(1) line filling.
  od -t a "$CHARSET_TEXT" |
    awk '{ for (i = 2; i <= NF; i++) print $i }' |
    sort | uniq >"$CHARSET_LIST"
fi

# Does input file exist?
# Quoting makes an empty or missing argument fail here instead of slipping
# through and leaving od(1) to read standard input.
[ -f "$IN_FILE" ] || file_not_found "$IN_FILE"
# Locate invalid characters.
#
# find_invalid_chars FILE LIST
#   Dump FILE with od(1) as named characters and print, prefixed with
#   "FILE: ", every dump line that holds at least one character name not
#   present in LIST (one name per line).
#   Returns 1 with a message on stderr when LIST is not readable; otherwise
#   returns the exit status of the awk stage.
find_invalid_chars() {
  # An unreadable list would otherwise load as an empty set and flag every line.
  if [ ! -r "$2" ]; then
    echo "File not readable: $2" >&2
    return 1
  fi

  # -v stops od from collapsing repeated lines into "*", so each offending
  # offset is reported. Reading via redirection keeps the file name away from
  # od option and offset parsing.
  od -v -t a <"$1" |
    awk -v CHARSET_LIST="$2" -v IN_FILE="$1" '
      BEGIN {
        # Load the accepted character names; -v makes the path visible here.
        while ((getline name < CHARSET_LIST) > 0) {
          CHARSET[name] = 1
        }
        close(CHARSET_LIST)
      }
      {
        # Field 1 is the offset; the remaining fields are character names.
        # Report the dump line once, at the first unknown name.
        for (i = 2; i <= NF; i++) {
          if (!($i in CHARSET)) {
            print IN_FILE ": " $0
            break
          }
        }
      }
    '
}

find_invalid_chars "$IN_FILE" "$CHARSET_LIST"
# List of invalid characters (patterns are quoted so the shell cannot glob-expand them):
# fmt -w 1 /tmp/htm__www_public_html__html.invalid_chars | grep -v '^/www' | grep -v '^[0-9][0-9]' | sort | uniq | join -j 1 -v 1 - charset.lst
###
#