#!/bin/sh
# jeeves.sh
#
# If you download this from my web page, it is important to
# use the SOURCE FILE. DON'T mouse it off the screen
# because many of the characters will not be visible.
#
# Note to myself: remember to maintain the link
#    ln jeeves.sh /usr/njas/wwwfiles/sequences/jeeves.sh.txt
# Call this file jeeves.sh (not jeeves.sh.txt)
#
# Most users should delete the final renumbering section (everything
# after the comment "At this point all the edited sequences are in the
# file junk8880") before running it - it is used for renumbering sequences.
#
# Processes new sequences from emails.
# Usage is:
#    jeeves.sh [-c] file1 >out
# where file1 contains a bunch of emails with
# comments, extensions, new sequences, etc
# and puts the entries into standard format.
#
# When called with the -c option, it changes all the lines like
#    %A A012345 Jack Smith, Dec 01, 2002
# to
#    %E A012345 More terms from Jack Smith, Dec 01, 2002
# which is the most likely thing that's needed
# with comments and extensions.
#
# The main output is placed in a new file called junk8880.
# Files junk8879 etc are temp files, created in the current directory.
#
# The awk interpreter defaults to /bin/gawk; set the AWK environment
# variable to use a different one.
#
# The program is not perfect, because it is hard to tell
# where one sequence ends and the next begins.
# As a result there will be some lines in the output
# that need to be cleaned up.
#
# A typical example is this. You will see a line that says something like
#
# %A A076668 Zakir F. Seidov (zfseidov@ycariel.yosh.ac.il), Oct 25 2002 From njas@research.att.com Tue Oct 29 21:35:38 2002 Received: from mail-green.research... [often a very long line]
#
# which needs to be replaced by
#
# %A A076668 Zakir F. Seidov (zfseidov@ycariel.yosh.ac.il), Oct 25 2002
#
# I simply look for the string " From " and do "delete to end of line"
# for each such line (if appropriate).
#
# The program basically looks for lines that begin with % and formats
# them nicely.
# It will accept continuation lines in many formats.
# On the other hand it has a tendency to ignore lines
# that begin *%
#
# Jun 09 2004 Change "'th" to "-th"
# Jun 09 2004 Omit semicolons from sequence lines
# Apr 19 2003 Conceal @ signs in email addresses
# Apr 04 2003 Improved way end of message is detected
# Apr 04 2003 Improved way example lines are handled
# Dec 10 2002 Two tiny adjustments to lengths of output lines
# Oct 30 2002, improved comments lines
# Added -c option to change %A to %E etc, Dec 20 2001
# Dec 13 2001, uses addCf.sh to add Cf.
# Feb 05 2001, allowing for possibility that both %STU and %VWX lines are
#    present at same time
# Feb 05 2001 cont., need to clear old entries better before reading new entry
# Modified July 21 2000 to allow for more blank lines inside fields
# Apr 12 1998, May 12 1998

# set files and directories
# (the default is unchanged; AWK may be overridden from the environment)
AWK=${AWK:-/bin/gawk}

# Set a default value
comment=FALSE

# Was it called with -c option?
while getopts c option
do
    case "$option" in
    c)  comment=TRUE;;
    \?) echo "Usage: jeeves.sh [-c] fooNEW >out" >& 2
        echo " -c means these are comments and extensions" >& 2
        exit 1;;
    esac
done

# Make sure an input file was specified
if [ "$OPTIND" -gt "$#" ]
then
    echo "Missing file name. Usage: jeeves.sh [-c] fooNEW >out" >& 2
    echo " -c means these are comments and extensions" >& 2
    exit 2
fi

# Get file name with input data
shift $((OPTIND - 1))
file1=$1

# Check file exists
if [ ! -f "$file1" ]
then
    echo "$file1 does not exist in this directory" >& 2
    exit 1
fi

echo "Unrenumbered sequences in junk8880" >& 2

# PREPROCESSING
# delete trailing blanks then
# change any control-M's to spaces
# Nov 22, 2001, change > and < back from html
# compress multiple blanks or tabs to a single blank
#
# A terminating blank line is added to the data stream only; the input
# file itself is no longer modified (it used to grow by one line per run).
#
# NOTE(review): the control-M, &lt;/&gt; and tab expressions below were
# reconstructed from the comments above, because the copy of this file
# being edited had lost its control characters and HTML entities.
# Confirm against a pristine copy of the source file.
cr=$(printf '\r')
tab=$(printf '\t')
{ cat <"$file1"; echo ""; } |
    sed -e 's/ *$//' \
        -e "s/${cr}/ /g" \
        -e 's/&lt;/</g' \
        -e 's/&gt;/>/g' \
        -e "s/[ ${tab}][ ${tab}]*/ /g" >junk8868

# Some perl stuff not currently used
# (Apr 10 2002: use perl to change nonascii characters)
#perl -p -e '$_ =~ s/\205/.../g;
# s/\222/,/g;
# s/\223/"/g;
# s/\224/"/g;
# s/\226/-/g;'

# FIRST awk RUN: THE INITIAL PROCESSING
# Keeps the lines that belong to entries, drops mail headers and
# separators, and deletes any internal blank lines
"$AWK" '
BEGIN { inseq = 0 }

# print a blank line before printing a %I line
# Beware: any non-% lines that happen before the first %I line are lost
/^%I/ { print ""; print; inseq = 1; next }

# Flag the end of the entry
/^RH/            { inseq = 0; next }
/^Return-Path/   { inseq = 0; next }
/^Return-path/   { inseq = 0; next }
/^RETURN-PATH/   { inseq = 0; next }
/^#############/ { inseq = 0; next }
/^-------------/ { inseq = 0; next }
$1 ~ /^From/ && $2 ~ /@/ { inseq = 0; next }

# Ignore any blank lines
/^$/ { next }

# Print any % lines
/^%/ { print; next }

# print any internal lines (i.e. those in the middle of an entry that do not
# start with a %)
inseq == 1 { print; next }

# otherwise skip line
{ next }
' <junk8868 >junk8869

# SECOND awk RUN
# amalgamate any internal non-% lines with line before
"$AWK" '
BEGIN { space = " "; inseq = 0; line = "" }

# new %I, print old line if any, start new entry, print %I line
/^%I / {
    if (length(line) > 1) {
        print line
        line = ""
        print ""
    }
    # zero out any old stuff that might be lying around
    inseq = 1; line = ""; Anum = $2
    # print the new start line
    print
    next
}

# new % line, print old line if any, start building current line
# (type is the field letter, i.e. the first field without its %)
/^%/ {
    if (length(line) > 1) print line
    type = $1
    sub("%", "", type)
    line = $0; next
}

# non-% line, amalgamate with old line
# unless it is an example line, in which case
# prefix it with %e and A-number
/^[^%]/ && inseq == 1 && type !~ /e/ { line = line space $0; next }

/^[^%]/ && inseq == 1 && type ~ /e/ {
    if (length(line) > 1) {
        print line
        line = ""
    }
    print "%e" space Anum space $0
    line = ""
    next
}

# a blank line here means end of sequence
/^$/ && inseq == 1 { print line; line = ""; inseq = 0; print ""; next }

# not sure if we ever get to here:
{ print }

# end - print remaining line if any
END {
    if (length(line) > 1) {
        print line
        print ""
    }
}
' <junk8869 >junk8870

# THIRD awk RUN, PROCESSES %STUVWX LINES
#
# merge_seq_lines LETTERS TAG
#   Filter from stdin to stdout.  Consecutive lines whose field letter is
#   one of LETTERS (e.g. STU) are collected into a single line
#       %TAG Annnnnn  term term term ...
#   which is written just before the next % line.  All other lines are
#   copied through unchanged.
#   A collected sequence that is still pending at end of input is written
#   out too (it used to be silently dropped when no further % line came).
merge_seq_lines() {
    "$AWK" -v pat="^%[$1]" -v tag="$2" '
    $0 ~ pat {
        Anum = $2
        # drop the first two fields (the %-letter and the A-number)
        n = split($0, a, "[,. \t][,. \t]*")
        for (i = 3; i <= n; i++) { seq = seq " " a[i] }
        trig = 1
        next
    }

    /^%/ {
        if (trig == 1) {
            printf("%%%s %s %s\n", tag, Anum, seq)
            trig = 0; seq = ""
        }
        print
        next
    }

    { print }

    END {
        if (trig == 1) printf("%%%s %s %s\n", tag, Anum, seq)
    }
    '
}

# amalgamate any %T %U lines with %S line
merge_seq_lines STU S <junk8870 >junk8879a

# amalgamate any %W %X lines with %V line
merge_seq_lines VWX V <junk8879a >junk8879b

# 4TH awk RUN
# if any signs in %S line, fix!
# The %S line is written with the minus signs removed, followed by a
# copy of the original line relabelled as %V (the signed version)
"$AWK" '
/^%S/ && $0 ~ /-/ {
    t1 = $0
    t2 = $0
    gsub("-", "", t1)
    gsub("S", "V", t2)
    print t1
    print t2
    next
}
{ print }
' <junk8879b >junk8879

echo "processing $file1, breaking up seq lines " >& 2

# 5TH awk RUN, BREAKS UP SEQ LINES ETC AND PRODUCES FINAL OUTPUT
#
# The awk output is then post-processed in the same pipeline:
#   - first sed: delete the first line; in %Y lines put a blank after each
#     comma and semicolon, squeeze runs of blanks, and make the line end
#     in exactly one period
#     NOTE(review): the closing "$a\" command is reproduced as found (with
#     no text after it), and the blank-squeezing rule was restored to two
#     blanks before the star; confirm both against a pristine copy.
#   - addCf.sh (external program; Dec 13 2001: adds Cf.)
#   - second sed: put a blank between a square bracket and an adjacent
#     # * + - or ! character
#   - disguise at signs: @ becomes (AT) in lines of type %[ACDEFHNYe]
#   - Added Jun 09 2004: omit any semicolons from sequence lines
#   - Also change "'th" to "-th"
"$AWK" '
# start of 5th awk run
# (signed is set here and in the %V rule but is not consulted elsewhere)
BEGIN { signed = 0 }

# look for next %S line
/^%S/ {    #L1
    # drop the %S A...... stuff (the first 11 characters)
    Anum = $2
    sub("^...........", "")
    printf("%%S %s ", Anum)
    line = 1
    # process it - first split into array "a"
    n = split($0, a, "[,. \t][,. \t]*")
    # tot is total no of seq characters printed so far on current line
    tot = 0
    # print the next number in the sequence
    for (i = 1; i <= n; i++) {
        # skip any empty entries
        if (length(a[i]) == 0) continue
        # if enough, print a comma, perhaps, and a newline
        # Dec 10 2002 I changed 70 to 71 in next line, because some edited
        # sequences were being curtailed
        if (length(a[i]) + 1 + tot > 71 && i > 1) {
            # patch 4/30/98 to attempt to cure comma at end of %U line
            if (line != 3) printf(",\n")
            else printf("\n")
            line = line + 1
            # second line is %T, third is %U, anything further is dropped
            if (line == 2) printf("%%T %s ", Anum)
            else if (line >= 4) next
            else printf("%%U %s ", Anum)
            tot = 0
        }
        # now we print the number
        if (tot) printf(",")
        printf("%s", a[i])
        tot += length(a[i]) + 1
    }
    # done with that sequence %S line - terminate current line?
    if (tot) printf("\n")
    next
}    #R1

# find next signed sequence line
/^%V/ {    #Ls
    signed = 1
    # drop the %V A...... stuff (the first 11 characters)
    Anum = $2
    sub("^...........", "")
    printf("%%V %s ", Anum)
    line = 1
    # process it - first split into array "a"
    n = split($0, a, "[,. \t][,. \t]*")
    # tot is total no of seq characters printed so far on current line
    tot = 0
    # print the next number in the sequence
    for (i = 1; i <= n; i++) {
        # skip any empty entries
        if (length(a[i]) == 0) continue
        # if enough, print a comma, perhaps, and a newline
        # Dec 10 2002 I changed 85 to 86 in next line, because some edited
        # sequences were being curtailed
        if (length(a[i]) + 1 + tot > 86 && i > 1) {
            # patch 4/30/98 to attempt to cure comma at end of %X line
            if (line != 3) printf(",\n")
            else printf("\n")
            line = line + 1
            # second line is %W, third is %X, anything further is dropped
            if (line == 2) printf("%%W %s ", Anum)
            else if (line >= 4) next
            else printf("%%X %s ", Anum)
            tot = 0
        }
        # now we print the number
        if (tot) printf(",")
        printf("%s", a[i])
        tot += length(a[i]) + 1
    }
    # done with that sequence %V line - terminate current line?
    if (tot) printf("\n")
    next
}    #Rs

# copy %I lines
/^%I/ {
    printf("\n"); print
    # zero out any old stuff lying around
    Anum = $2
    i = 0
    off1 = ""
    next
}

# handle %K lines: strip a leading or trailing comma from the keyword field
/^%K/ { sub("^,", "", $3); sub(",$", "", $3); print; next }

# handle %O line
/^%O/ {    #L2
    # get old offset $3 if any
    if (length($3) > 0) off1 = $3
    else off1 = 1
    nnn = split(off1, oo, ",")
    # only one number given: work out the second one
    if (nnn == 1) {    #L2a
        # do we have a sequence to get offset from?
        # (i and a[] are left over from the last %S or %V line)
        if (i > 0) {
            # get true offset
            # get offset (seq used is a[1] ... a[ilast] )
            ilast = i
            # ireal will be true offset [k = 0 indicates all terms <= 1]
            k = 0
            ireal = 0
            for (i = 1; i <= ilast; i++) {
                if (length(a[i]) > 0) ireal++
                if (a[i] > 1) { k = 1; break }
            }
            if (k == 0) ireal = 1
        }
        else ireal = 2
        off1 = off1 "," ireal
    }    #R2a
    printf("%%O %s %s\n", Anum, off1)
    next
}    #R2

# add period to %N line if necessary
/^%N/ && /[^\\.]$/ { sub("$", "."); print; next }
/^%N/ { print; next }

# any other line, just copy unless empty
/^%/ { gsub(" A123456", ""); if (length($0) > 11) print; next }

#{ print }
# end of 5th awk run
' <junk8879 |
    sed '1d
/%Y/s/,/, /g
/%Y/s/;/; /g
/%Y/s/  */ /g
/%Y/s/$/./
/%Y.*\.\.$/s/\.\.$/./
$a\
' |
    addCf.sh |
    sed 's/\[#/[ #/g
s/#\]/# ]/g
s/\[\*/[ */g
s/\*\]/* ]/g
s/\[+/[ +/g
s/+\]/+ ]/g
s/\[-/[ -/g
s/-\]/- ]/g
s/\[\!/[ !/g
s/\!\]/! ]/g' |
    sed '/^%[ACDEFHNYe]/s/@/(AT)/g' |
    sed '/^%[STUVWX]/s/;//g' |
    "$AWK" ' { gsub("\047th", "-th"); print } ' >junk88800

# are these comments and extensions?
# If so, every %A line becomes a "%E ... More terms from ..." line.
# Only lines that were %A are touched; lines that were already %E are
# left alone (they used to get "More terms from" inserted as well).
if [ "$comment" = TRUE ]
then
    sed '/^%A/{
s/^%A/%E/
s/[0-9] /&More terms from /
}' <junk88800 >junk8880
else
    cp junk88800 junk8880
fi

# At this point all the edited sequences are in the file junk8880
# Most associate editors will stop at this point
# The rest of this file is mostly for use by the editor-in-chief

# This next section takes all the sequence entries
# and renumbers them, using the list of available numbers in the file
# "next" and using two further shell programs, renum.sh and renum2.sh

echo "Processing $file1, renumbering consecutively " >& 2
renum.sh junk8880 12345 > junk8881
echo "Processing $file1, taking new numbers from the file called next " >& 2
renum2.sh junk8881
# last line