; XEmacs: This file contains -*- Scheme -*- source code.

;;; Nematode DNA

;;; John David Stone
;;; Department of Mathematics and Computer Science
;;; Grinnell College
;;; stone@cs.grinnell.edu

;;; created April 7, 2000
;;; last revised May 17, 2000

;;; Living cells sustain and regulate themselves by producing various
;;; proteins, assembling them from smaller chemical units (amino acids)
;;; according to instructions that are encoded in the structure of
;;; molecules of deoxyribonucleic acid (DNA).

;;; A DNA molecule is composed of a large number of ``bases,'' submolecules
;;; of four types: adenine, cytosine, guanine, and thymine.  The general
;;; shape of the molecule is like a ladder, one end of which has been
;;; repeatedly twisted, so that each of the uprights is a helix.  Each of
;;; the rungs of the ladder consists of a pair of bases, either with
;;; adenine at one end and thymine at the other, or with cytosine at one
;;; end and guanine at the other.  Any base can occur at either end of a
;;; rung.

;;; Various organizations have compiled information about the sequence of
;;; bases in the DNA of several organisms and stored it in files that are
;;; available on the Internet.  The file format used by the C. elegans
;;; Genome Project at The Sanger Center (http://www.sanger.ac.uk/), which
;;; is compiling the genome of the nematode _Caenorhabditis elegans_,
;;; consists of a one-line label identifying one of the nematode's
;;; chromosomes, followed by an arbitrary number of lines containing the
;;; sequence of bases along one strand of the DNA in that chromosome, fifty
;;; bases to a line (except that the last line may contain fewer than fifty
;;; bases).  Each base is represented by the initial letter of its name
;;; (#\a, #\c, #\g, or #\t).  At a few positions in the sequence, the base
;;; has not yet been determined.  If a number of undetermined bases occur in
;;; a row, each one is represented by a hyphen character, #\-.  An isolated
;;; undetermined base is represented by the character #\n.

(define known-bases (list #\a #\c #\g #\t))

(define bases-or-substitutes (append known-bases (list #\n #\-)))

;;; Given an already-opened input port, the TALLY-BASES procedure
;;; determines the number of known occurrences of each of the four bases in
;;; the base sequence that can be read in through the port.  It returns an
;;; association list in which the keys are characters designating bases and
;;; the values are tallies of known occurrences.  Characters that do not
;;; designate bases are ignored.  Entries appear in the order in which the
;;; bases were first encountered; a base that never occurs has no entry.

(define tally-bases
  (lambda (source)
    ;; The accumulator starts as the literal empty list, '(), which is
    ;; bound in every Scheme implementation (unlike the identifier NULL).
    (let kernel ((tallies '())
                 (next (read-known-base source)))
      (if (eof-object? next)
          tallies
          (kernel (bump next tallies) (read-known-base source))))))

;;; Given a unary predicate applying to characters, the READER procedure
;;; constructs and returns a procedure that reads in characters through
;;; a specified input port until it finds one that satisfies the predicate
;;; and returns it, or else encounters the end-of-file object and returns
;;; that.

(define reader
  (lambda (test)
    (lambda (source)
      (let kernel ((ch (read-char source)))
        (if (or (eof-object? ch) (test ch))
            ch
            (kernel (read-char source)))))))

;;; The READ-KNOWN-BASE procedure reads in characters through a given input
;;; port until it finds one that represents a known base and returns it, or
;;; encounters the end-of-file object and returns that.

(define read-known-base
  (reader (lambda (ch) (memv ch known-bases))))

;;; The READ-BASE procedure reads in characters through a given input
;;; port until it finds one that represents a base (or marks the position
;;; of an unknown base) and returns it, or encounters the end-of-file
;;; object and returns that.

(define read-base
  (reader (lambda (ch) (memv ch bases-or-substitutes))))

;;; The BUMP procedure takes two arguments, of which the second must be an
;;; association list in which the data are numbers.  It returns an
;;; association list with the same keys and the same associated data,
;;; except that the datum associated with the value that is the first
;;; argument to BUMP has been increased by 1.  If the first argument is not
;;; a key of the original association list, an entry for that value is
;;; added (at the end of the list), with 1 as the associated value.

(define bump
  (lambda (key als)
    (let kernel ((rest als))
      (cond ((null? rest)
             (list (cons key 1)))
            ;; Keys are compared with EQV?, which is well defined on
            ;; characters and numbers; the result of EQ? on such values
            ;; is implementation-dependent.
            ((eqv? key (caar rest))
             (cons (cons key (+ (cdar rest) 1)) (cdr rest)))
            (else
             (cons (car rest) (kernel (cdr rest))))))))

;;; The directions for constructing proteins are encoded in the sequence of
;;; bases attached to one of the two uprights of the DNA ladder.  (The
;;; sequence of bases on the other upright contains the same information,
;;; encoded in ``complementary'' form, like a photographic negative; cells
;;; use this complementary encoding in the process of duplicating the
;;; instructions and transporting them to a cellular workshop for protein
;;; construction.)

;;; Each group of three adjacent bases along one upright of a DNA molecule
;;; is a _codon_.  A codon encodes the instruction to place some particular
;;; amino acid at a position in the protein that corresponds to the
;;; position of that codon.  The mapping from codons to amino acids (the
;;; _genetic code_) is constant.  A sequence of bases that directs the
;;; construction of a protein typically begins with the three-base ``start
;;; codon'' atg and ends with one of the ``stop codons'' taa, tag, and
;;; tga.

;;; The FIND-START-CODON procedure reads in bases through a given port
;;; until it finds three successive bases that form a start codon (in which
;;; case it returns the number of bases preceding the first base of the
;;; start codon) or encounters the eof-object (in which case it returns
;;; #F).  Characters that are neither bases nor placeholders for unknown
;;; bases (newlines, for instance) are skipped and are not counted.

(define find-start-codon
  (lambda (source)
    ;; The initial three-base window is obtained with READ-CODON so that
    ;; all three of its positions are filled by READ-BASE; a raw READ-CHAR
    ;; here could let a newline into the window.
    (let ((first-codon (read-codon source)))
      (if (eof-object? first-codon)
          #f
          ;; Slide the window along the sequence one base at a time.
          (let kernel ((first (string-ref first-codon 0))
                       (second (string-ref first-codon 1))
                       (third (string-ref first-codon 2))
                       (bases-passed 0))
            (if (start-codon? (string first second third))
                bases-passed
                (let ((next (read-base source)))
                  (if (eof-object? next)
                      #f
                      (kernel second third next (+ bases-passed 1))))))))))

;;; The START-CODON? predicate determines whether a given string of three
;;; bases is a start codon.

(define start-codon?
  (lambda (str)
    (string=? str "atg")))

;;; The FIND-STOP-CODON procedure reads in one codon (three bases) at a
;;; time, from an already-open input port, until it finds a stop codon (in
;;; which case it returns the number of codons read, including the stop
;;; codon) or encounters the eof-object (in which case it returns #F).

(define find-stop-codon
  (lambda (source)
    (let kernel ((codons-read 0)
                 (next (read-codon source)))
      (cond ((eof-object? next) #f)
            ((stop-codon? next) (+ codons-read 1))
            (else (kernel (+ codons-read 1) (read-codon source)))))))

;;; The READ-CODON procedure attempts to read in three bases through an
;;; already open input port.  If it succeeds, it returns the three bases as
;;; a string; if not, it returns the end-of-file object.

(define read-codon
  (lambda (source)
    (let ((first (read-base source)))
      (if (eof-object? first)
          first
          (let ((second (read-base source)))
            (if (eof-object? second)
                second
                (let ((third (read-base source)))
                  (if (eof-object? third)
                      third
                      (string first second third)))))))))

;;; The STOP-CODON? predicate determines whether a given codon is a stop
;;; codon.

(define stop-codon?
  (lambda (str)
    (or (string=? str "taa")
        (string=? str "tag")
        (string=? str "tga"))))

;;; Part 1: Determine the number of known occurrences of each of the bases
;;; a, c, g, and t in chromosome II of the nematode C. elegans.

;;; Give the data file a name.
(define nematode-chromosome-II-file-name
  "/home/stone/courses/scheme/data/CHROMOSOME_II.dna")

;;; DISCARD-LINE consumes characters from the given input port up to and
;;; including the next newline character, or up to the end of the file if
;;; no newline remains.  It is called only for this side effect; its
;;; return value is unspecified.

(define (discard-line source)
  (do ((ch (read-char source) (read-char source)))
      ((or (eof-object? ch) (char=? ch #\newline)))))

;;; DISPLAY-LINE accepts any number of arguments, writes each of them out
;;; in order with DISPLAY, and then terminates the line with a newline.

(define (display-line . items)
  (for-each display items)
  (newline))

;;; DISPLAY-TALLIES takes an association list in which each key is one of
;;; the four bases and each value is an integer, and writes out a small
;;; table with one ``key: value'' line per entry.  For an empty list it
;;; writes a message saying that no bases were found.

(define (display-tallies tallies)
  (cond ((null? tallies)
         (display-line "No bases were found."))
        (else
         (for-each (lambda (entry)
                     (display-line (car entry) ": " (cdr entry)))
                   tallies))))

;;; Open the data file, discard the file header, compute and write out the
;;; tallies, and close the port.

(let ((port (open-input-file nematode-chromosome-II-file-name)))
  (discard-line port)
  (display-tallies (tally-bases port))
  (close-input-port port))

;;; Part 2: Determine how many bases precede the first occurrence of the
;;; start codon in the chromosome II sequence and how many codons there are
;;; after this start codon up to, and including, a stop codon.

;;; Open the data file, discard the file header, advance to the start
;;; codon, display the number of bases passed, advance to the stop codon,
;;; display the number of codons read, and close the port.
(let ((port (open-input-file nematode-chromosome-II-file-name)))
  ;; Skip the one-line chromosome label that precedes the base sequence.
  (discard-line port)
  (let ((skipped (find-start-codon port)))
    (cond ((not skipped)
           (display-line "No start codon was found."))
          (else
           (display-line skipped
                         " "
                         (if (= skipped 1) "base" "bases")
                         " preceded the first start codon.")
           ;; The port is now positioned just past the start codon, so the
           ;; search for a stop codon proceeds codon by codon from there.
           (let ((codon-count (find-stop-codon port)))
             (cond ((not codon-count)
                    (display-line "No stop codon was found."))
                   (else
                    (display-line codon-count
                                  " "
                                  (if (= codon-count 1) "codon" "codons")
                                  " followed the start codon, up to "
                                  "and including the first subsequent "
                                  "stop codon.")))))))
  (close-input-port port))

;;; Here are the results:

;;; c: 2720612
;;; t: 4784354
;;; a: 4795288
;;; g: 2715249
;;; 209 bases preceded the first start codon.
;;; 7 codons followed the start codon, up to and including the first subsequent stop codon.