#!/bin/sh
# check94
# Usage:   check94 filename
# Purpose: Checks sequences for commonest errors.
#
# Exactly one file name is accepted (see the argument check below), even
# though the usage message printed at run time says "file(s)".  Several
# tests pipe grep output without -h into a second, anchored pattern; with
# more than one file grep would prefix each line with the file name and
# those anchored patterns would stop matching.
#
# If you download this from my web page, it is important to
# use the SOURCE FILE.  DON'T mouse it off the screen
# because many of the characters will not be visible.
# The name of this program is check94 (not check94.txt).
#
# With some versions of grep you may have to remove
# the backslashes in the lines like
#   $GREP "^%O" $* | $GREP -v "^%O A[0-9]\{6\} [-0-9]+,[0-9]+$"
# The way they are now works fine with gre.
#
# Note to myself: remember to maintain the link
#   ln check94 /home/njas/wwwfiles/sequences/check94.txt
#
# Revised Dec 29 1995 Apr 11 1998 July 12 1998 June 10 2000
# Changed July 27 2000 to do more checks
# Comments revised Oct 22 2002
# More checks Nov 06 2002
# Checks for @ signs in email addresses, Apr 19, 2003
#
# Every test below prints the offending lines (if any) to stdout; a clean
# file produces only the "Checking ..." progress messages.
# "set -e" is deliberately NOT used: grep exits non-zero when it finds
# nothing, which is the normal outcome for a clean file.

# Check program was called correctly
if [ "$#" -ne 1 ]
then
    echo "Incorrect number of argts. Usage: check94 file(s) " >&2
    echo "Performs various checks on sequence files" >&2
    exit 1
fi

# define version of awk to be used
AWK=/usr/common/bin/gawk

# define which version of grep to use
# I prefer Andrew Hume's very fast version, called gre
GREP=gre

# Check file #1 exists
if [ ! -f "$1" ]
then
    echo "$1 does not exist in this directory" >&2
    exit 1
fi

#############################################
# Start the checks
#############################################

# Check for repeated %I lines
# An individual sequence can only have one %I line
# Furthermore all the sequences must have distinct A-numbers
# so all the %I lines in the database must be distinct
# If this test fails it means usually that someone
# has submitted the same sequence twice
# or else that two different sequences have the same A-number
# These are serious errors
echo "Checking $* for repeated %I lines "
$GREP -h "%I" "$@" | $AWK ' { print $2 } ' | sort -n | uniq -d

# Check that the offset lines are %O not %0 !
echo "Checking $* for %0 lines [with 0 instead of O] "
$GREP "%0" "$@"

# Check for signs in %STU lines
# The %S, %T, %U lines are for the unsigned sequence
# with the minus signs omitted
# If there are minus signs in the sequence then the keyword
# "sign" is used, the signed sequence appears in the %V, %W, %X
# lines and the unsigned sequence in the %S, %T, %U lines.
# Otherwise, if the sequence is nonnegative, the keyword "nonn" is used
# and only %S, %T, %U lines appear
echo "Checking for signs in %STU lines"
$GREP -h "%[STU]" "$@" | sed -n '/\-/p'

# Because of Perl scripts used by the webmaster, certain
# strings involving square brackets may not appear in the database
# For example a square bracket may not be followed by a plus sign.
# The forbidden strings are
#      with no spaces here!
#       |           |
#       v           v
#     [ +   and   + ]
#     [ -   and   - ]
#     [ !   and   ! ]
#     [ *   and   * ]
#     [ #   and   # ]
# You see I cannot even show you the strings that are forbidden without
# inserting spaces!
# The way to fix this is to insert blanks, as I have done
# above.  This is very annoying since these strings are very common
# in Mathematica
echo "Checking for the pairs [ +, + ], [ -, - ], [ !, ! ], [ *, * ], [ #, # ]"
$GREP "\[[\+\-\*\^\#\!]" "$@"
$GREP "[\+\-\*\^\#\!]\]" "$@"

# The commonest error of all is to include a line
# with the wrong A-number (putting a line %F A012345 in
# the entry for A012346, say)
echo "Checking $* to see if lines are grouped correctly"
cat "$@" | $AWK '
# prev holds the A-number from the most recent %I line; every later
# non-empty line must carry the same A-number in its second field.
$1 == "%I" { prev = $2 }
NF > 0 && $1 != "%I" && $2 != prev {
    print "grouping error at ", prev
    print $0
}
'

# A line consisting of only a tag and an A-number carries no content
echo "Checking $* for empty lines"
$GREP -h "^%[STUVWXNDHFKOoptAE] A......$" "$@"

# Further basic checks
echo "Checking punctuation etc:"

# Check that crucial lines are present
# For example, every sequence must have %I, %O and %K lines
cat "$@" | $AWK '
# p[%I]=1 if %I line is present, etc; seic=1 if S line "ends in comma"
# (teic, ueic, veic, weic, xeic likewise for the T, U, V, W, X lines).
# id is the A-number of the sequence being read; have becomes 1 once
# the first %I line has been seen.

# Print the summary for the sequence that has just been read completely.
# Called when the next %I line arrives and again at end of input.
function report_sequence() {
    if ( p["%S"] != 1 ) print id, "has no S line";
    if ( p["%N"] == 0 ) print id, "has no N line";
    if ( p["%O"] != 1 ) print id, "has no O line";
    if ( p["%K"] != 1 ) print id, "has no K line";
    # Check commas: a line must end in a comma exactly when a
    # continuation line (S->T->U, V->W->X) follows it.
    if ( p["%T"] == 1 && seic == 0 ) print id, " missing comma on S line";
    if ( p["%T"] != 1 && seic == 1 ) print id, " superfluous comma on S line";
    if ( p["%U"] == 1 && teic == 0 ) print id, " missing comma on T line";
    if ( p["%U"] != 1 && teic == 1 ) print id, " superfluous comma on T line";
    if ( p["%U"] == 1 && ueic == 1 ) print id, " superfluous comma on U line";
    if ( p["%T"] == 0 && p["%U"] == 1 ) print id, " U line but not T!";
    if ( p["%W"] == 1 && veic == 0 ) print id, " missing comma on V line";
    if ( p["%W"] != 1 && veic == 1 ) print id, " superfluous comma on V line";
    if ( p["%X"] == 1 && weic == 0 ) print id, " missing comma on W line";
    if ( p["%X"] != 1 && weic == 1 ) print id, " superfluous comma on W line";
    if ( p["%X"] == 1 && xeic == 1 ) print id, " superfluous comma on X line";
    if ( p["%W"] == 0 && p["%X"] == 1 ) print id, " X line but not W!";
}

# Forget everything recorded about the previous sequence.
function reset_sequence() {
    p["%I"] = 1;
    p["%S"] = 0; p["%T"] = 0; p["%U"] = 0;
    p["%V"] = 0; p["%W"] = 0; p["%X"] = 0;
    p["%N"] = 0; p["%R"] = 0; p["%A"] = 0; p["%O"] = 0; p["%K"] = 0;
    seic = 0; teic = 0; ueic = 0; veic = 0; weic = 0; xeic = 0;
}

BEGIN {
    p["%I"] = 0;
    p["%S"] = 0; p["%T"] = 0; p["%U"] = 0;
    p["%N"] = 0; p["%R"] = 0; p["%A"] = 0; p["%O"] = 0; p["%K"] = 0;
    seic = 0; teic = 0; ueic = 0; veic = 0; weic = 0; xeic = 0;
    id = 0; have = 0;
}

# A %I line starts a new sequence: report on the previous one (if there
# was one), then start afresh.
$1 == "%I" {
    if (have == 1) report_sequence();
    reset_sequence();
    id = $2;
    have = 1;
}

# Checks made while reading new seq line by line

#$1 ~ /%[STUVWX]/ && /O/ { print id, $1, " contains an Oh" }

# The terms (third field) may contain only digits, commas and minus signs
$1 ~ /%[STUVWX]/ {
    t1 = $3;
    gsub("[0-9,-]", "", t1);
    if (length(t1) > 0) print id, $1, "contains bad character: \"", t1, "\""
}

# Record which line types this sequence has
$1 != "%I" { p[$1] = 1; }

# Record which term lines end in a comma
$1 == "%S" && /,$/ { seic = 1 }
$1 == "%T" && /,$/ { teic = 1 }
$1 == "%U" && /,$/ { ueic = 1 }
$1 == "%V" && /,$/ { veic = 1 }
$1 == "%W" && /,$/ { weic = 1 }
$1 == "%X" && /,$/ { xeic = 1 }

/^[^%]/ { print id, $1, "line", NR, " does not begin with %" }
/,,/    { print id, $1, "line", NR, "contains 2 commas" }

# A term line has exactly three fields: tag, A-number, terms
$1 ~ /^%[STUVWX]$/ && NF > 3    { print id, $1, " Internal blank on line", NR }
$1 ~ /^%[STUVWX]$/ && $3 ~ /\./ { print id, $1, " Period on line", NR }

# Disabled checks, kept for reference:
#$1 == "%N" && !/\.$/ && !/\!$/ { print id, $1, "Line ", NR, "missing period" }
#$1 == "%R" && NF > 2 && !/\.$/ { print id, $1, "Line ", NR, "missing period" }
#$1 == "%Y" && NF > 2 && !/\.$/ { print id, $1, "Line ", NR, "missing period" }
#$1 == "%F" && NF > 2 && !/\.$/ { print id, $1, "Line ", NR, "missing period" }

END {
    # Check last sequence was OK
    if (have == 1) report_sequence();
}
'

# No line should end with a blank
echo "Checking for terminating blanks"
$GREP " $" "$@"

#
# Check for repeated % lines:
# This is the slowest part of the tests
#
echo "Checking for repeated % lines"
cat "$@" | $AWK '
BEGIN {
    # RS = "" makes each blank-line-separated paragraph one record;
    # FS = "\n" makes each line of that paragraph one field.
    RS = ""; FS = "\n"
    # line types that may occur at most once per sequence
    type[ 2]="%A"
    type[ 6]="%I"
    type[ 7]="%K"
    type[ 8]="%N"
    type[ 9]="%O"
    type[12]="%S"
    type[13]="%T"
    type[14]="%U"
    type[15]="%W"
    # clear all possible types
    for (i in type) have[type[i]]=0;
}
# start processing next sequence:
{
    for (i = 1; i <= NF; i++) {
        # characters 4..10 of the %I line are the A-number
        if ($i ~ /^%I/) id = substr( $i, 4, 7 )
        for (j in type ) { if ($i ~ type[j]) ++have[type[j]] };
    };
    # see if any repeated lines
    for (j in type ) {
        if ( have[type[j]] > 1 ) print( id, " ", type[j], " repeated")
    };
    # clear all possible types
    for (i in type) have[type[i]]=0;
}
'

# Check %H lines are OK
# Checks for common errors in %H lines which would
# prevent them being made into links
# NOTE(review): the copy of this program under review had been passed
# through an HTML filter that removed the text between "<a href" and the
# next ">" and the literal "</a>".  The first three rules below are
# reconstructed from what was left; the regular expression of the fourth
# rule was lost and /&#/ is an assumption based on its message - confirm
# against a pristine copy.
$GREP "^%H" "$@" | $AWK '
$0 !~ /<[aA] [hH][rR][eE][fF]=\"/ { print "%H line does not contain <a href=\""; print $0 }
$0 !~ /\">/ { print "%H line does not contain \">"; print $0 }
$0 !~ /<\/[aA]>/ { print "%H line does not contain \"</a>\""; print $0 }
$0 ~ /&#/ { print "%H line contains ampersand-pound (change amp#35; to #):"; print $0 }
'

# The URL is what lies between the first ="  and the following ">
echo "Checking for internal blanks in %H lines"
$GREP "^%H" "$@" | sed -e 's/^.*="//' -e 's/">.*$//' | $GREP " "

# Check for %c etc etc
# Report an error if any line begins %? where ? is one of
# abcdfghijklmnqrsuvwxyzBGJLMPQZ
# - these lines are not used at present
cat "$@" | $AWK ' $1 ~ /%[abcdfghijklmnqrsuvwxyzBGJLMPQZ]/ { print "Bad first field:"; print $0 }'

# Check for %R
$GREP %R "$@"

# Check for control-M characters (the database only uses ascii characters)
echo "Checking for control-M characters"
# first the two visible characters caret, M ...
$GREP "\^M" "$@"
# ... then a real carriage return.
# NOTE(review): the reviewed copy showed a plain blank between the quotes
# here, which would report every line containing a space; a raw control-M
# is presumed to have been lost in transit, so it is generated explicitly.
cr=$(printf '\r')
$GREP "$cr" "$@"

# Check for bad A-numbers
# The correct format is Annnnnn where n is in range 0..9
echo "Check for bad A-numbers"
$GREP "^%[NCFYAEe] A[0-9]\{6\} .*A[0-9]\{2,5\}[\n\s.,;\-]" "$@" | sed '/lattice/d'
$GREP "^%[NCFYAEe] A[0-9]\{6\} .*A[0-9]\{7,9\}[\n\s.,;\-]" "$@" | sed '/lattice/d'

# Check for bad dates
# The correct format for dates is Mon nn YEAR
# where Mon is exactly 3 letters, first one capitalized
# nn is two digits, e.g. 03
# YEAR is 4 digits, e.g. 2002
# (May is absent from the list: its full name is already 3 letters.)
echo "Check for bad dates"
$GREP "^%[AE].*January" "$@"
$GREP "^%[AE].*February" "$@"
$GREP "^%[AE].*March" "$@"
$GREP "^%[AE].*April" "$@"
$GREP "^%[AE].*June" "$@"
$GREP "^%[AE].*July" "$@"
$GREP "^%[AE].*August" "$@" | sed '/Augustin/d'
$GREP "^%[AE].*September" "$@"
$GREP "^%[AE].*October" "$@"
$GREP "^%[AE].*November" "$@"
$GREP "^%[AE].*December" "$@"

# Check for bad %O or %K lines
# The correct format for an offset line is (e.g.)
#   %O A012345 0,3
# where the 0 means first subscript is 0,
# and the 3 means that the first number bigger than 1 in
# magnitude is the third from the term, starting the count at 1
# The correct format for a keyword line is (e.g.)
#   %K A012345 nonn,easy
echo "Check for bad %O or %K lines"
$GREP "^%O" "$@" | $GREP -v "^%O A[0-9]\{6\} [-0-9]+,[0-9]+$"
$GREP "^%K" "$@" | $GREP -v "^%K A[0-9]\{6\} [a-z,]+$"

# Check for internal blanks
# (a blank line that is not followed by the %I line of a new sequence)
cat "$@" | $AWK '
# lwb means last-was-blank
BEGIN { lwb = 0 }
{ if ( length($0) <= 1 ) { lwb = 1; next } }
{ Anum = $2 }
lwb == 1 && $1 !~ "%I" { print( "internal blank at ", Anum ); lwb = 0; next }
lwb == 1 && $1 ~ "%I" { lwb = 0; next }
'

# Check for bad "more"'s
# The keyword "more" means "more terms are needed"
# The program nomore.sh must be downloaded separately
# (its standard output is discarded here)
echo "Checking for bad more keywords"
nomore.sh "$@" > /dev/null

# Check for leading 0's
# The terms in the sequences must be ordinary decimal
# integers, such as 0, 1, -1, -202, but not 015, not 1.0e^14
echo "Checking for leading 0s"
sed -n '/^%[STUVWX]/p' "$@" | sed -n '/[ ,]0[1-9]/p'

# Check for obsolete keywords
# NOTE(review): the message mentions @ signs, but no test for them is
# present in this copy of the program.
echo "Checking obsolete keywords, unnecessary commas, bad %D %H lines, @ signs, etc."
sed -n \
    -e '/^%K .*huge/p' \
    -e '/^%K .*done/p' \
    -e '/^%K .*look/p' \
    -e '/^%K .*part/p' \
    -e '/[Hh]ence,/p' \
    -e '/[Tt]hus,/p' \
    -e '/[Tt]herefore,/p' \
    -e '/^%D .*[Hh][Rr][Ee][Ff]/p' "$@"
sed -n '/^%H/p' "$@" | sed '/[Hh][Rr][Ee][Ff]/d'

# check for links in non-%H lines
# NOTE: unlike the other tests, the matches are not shown on the terminal;
# they are written to the file out66 in the current directory.
echo "checking for links in non-%H lines"
$GREP "a href" "$@" | $GREP -v %H >out66

# check no of blank lines = no of seqs
t1=$($GREP '^$' "$@" | wc -l)
#echo $t1
t2=$($GREP '%I' "$@" | wc -l)
#echo $t2
# t1 and t2 are left unquoted on purpose: some wc implementations pad the
# count with blanks, and word splitting strips that padding.
# shellcheck disable=SC2086
if test $t1 -ne $t2
then
    echo "There are $t1 blank lines but $t2 sequences"
fi

# end of tests
echo "You should also run check_big"