Solution to exercise #5

{ This program is a solution to exercise #5 for CSC 206, ``Fundamentals of
  computer science II,'' offered at Grinnell College in fall semester,
  1996.

  It performs automatic indexing on a text that has been divided into
  pages, recording the positions of marked words and phrases in the source
  text and producing an alphabetized index of them.

  Specifically, the text to be indexed is read from standard input; it is
  assumed to begin with page 1, and each occurrence of the ASCII form-feed
  character is assumed to mark the end of a page and possibly the beginning
  of a new one.  Each name, word, or phrase that is to be recorded in the
  index must be marked in the input with the ASCII commerical-at character,
  @.  If the character immediately following the commercial-at is a
  left-parenthesis, then the text of the index entry is the string of
  characters between that left-parenthesis and the next following
  right-parenthesis. If the character after the commercial-at is not a
  left-parenthesis, then the index entry is the string of characters
  between the commercial-at and the next following space character or end
  of a line of text.

  However, it is an error (1) for the text of an index entry to include a
  commercial-at character, a left-parenthesis, or a form-feed character;
  (2) for the source text to contain a commercial-at character followed by
  a left-parenthesis, if there is no subsequent right-parenthesis; (3) for
  the source text to contain a commercial-at character not followed by a
  left-parenthesis if there is no subsequent space or end of line; or (4)
  for the index entry to be the null string (i.e., for the commercial-at
  character to be immediately followed  by a space or the end of the line,
  or for the commercial-at to be followed immediately by a left-parenthesis
  and then an immediate right-parenthesis).

  If the program encounters any of these errors, it produces only a list of
  syntax errors in the source, giving the page and line number at which
  each error is detected.

  On the other hand, if the input is free from these errors, the program
  produces an alphabetized list of the index entries it encounters, one per
  line, each followed by a list of the page numbers on which the entry
  occurs.  The entries are alphabetized without regard to case
  (specifically, as if all were entirely in upper case).  Non-letter
  characters in the index entries are arranged according to their ASCII
  values.

  Programmer: John Stone, Grinnell College.
  Date of this version: November 17-24, 1996.
}

$heap_dispose on$
$assert_halt on$

program Indexer (Input, Output, StdErr);

$search 'strings.o'$
import
  Strings;

const

  { A code number is provided for every kind of internal programming error
    that the program checks for. }

  FirstExceptionCode = 1;

  CreateSingletonPageNumberListException = 1;
  UndefinedPageNumberListException = 2;
  NilTailInPageNumberListException = 3;
  AppendToPageNumberListException = 4;
  InvalidPageNumberListArgumentException = 5;
  ExceptionException = 6;

  LastExceptionCode = 6;

  Debug = True;
    { True during debugging, False (for greater speed) when the program
      is released }

type
  PageNumber = 1 .. MaxInt;
    { A page number must be a positive integer. }

  PageNumberLink = ^PageNumberComponent;
  PageNumberComponent = record
                          Datum: PageNumber;
                          Next: PageNumberLink
                        end;
  PageNumberList = record
                     Head: PageNumberLink;
                     Tail: PageNumberLink
                   end;
    { The numbers of the pages on which a given index entry appears are
      maintained in a list -- not, strictly speaking, a queue, because
      we need the operation of peeking at the last element added. }

  Entry = record
            EntryText: String;
            Locations: PageNumberList
          end;
    { A component of the index consists of the text of the entry (a string
      of characters) and a list of the page numbers on which it occurs. }

  EntryLink = ^EntryComponent;
  EntryComponent = record
                     Datum: Entry;
                     Next: EntryLink
                   end;
  EntryList = EntryLink;
    { The complete list of index entries is kept as a singly-linked list,
      maintained at all times in alphabetical order (case-insensitive, as
      prescribed by the problem specification. }
    
var
  Index: EntryList;
    { a list containing all of the index entries, with the page numbers on
      which each one occurs }
  ErrorFound: Boolean;
    { indicates whether any syntax errors are encountered during the
      construction of the index }

  { The IndexerExceptionHandler reports an internal programming error and
    halts the program.  All such errors are of the ``this can't happen''
    sort -- the programmer believes that this procedure will never be
    invoked. }

  procedure IndexerExceptionHandler (ExceptionCode: Integer);
  begin
    if (ExceptionCode < FirstExceptionCode) or
                        (LastExceptionCode < ExceptionCode) then
      ExceptionCode := ExceptionException;
    WriteLn (StdErr, 'Exception #', ExceptionCode : 1,
             ' in program Indexer:'); 
    case ExceptionCode of
    CreateSingletonPageNumberListException:
      WriteLn (StdErr, 'An incorrectly constructed PageNumberList was ',
               'returned by the CreateSingletonPageNumberList function.');
    UndefinedPageNumberListException:
      WriteLn (StdErr, 'An undefined PageNumberList variable was given ',
               'as argument to a procedure.');
    NilTailInPageNumberListException:
      WriteLn (StdErr, 'A nil pointer was encountered in the Tail field ',
               'of a PageNumberList.');
    AppendToPageNumberListException:
      WriteLn (StdErr, 'An incorrectly constructed PageNumberList was ',
               'returned by the AppendToPageNumberList procedure.');
    InvalidPageNumberListArgumentException:
      WriteLn (StdErr, 'An incorrectly constructed PageNumberList was ',
               'provided as an argument to a procedure.');
    ExceptionException:
      WriteLn (StdErr, 'The IndexerExceptionHandler procedure received ',
               'an unknown exception code.')
    end
  end;

  { The next few procedures and functions define the operations on the
    PageNumberList type.  In this program, there's never a need for an
    empty list of page numbers, so a page number list is created with
    its first element already in place, and this element is never
    removed; consequently, many of the procedures have the precondition
    that the list is not empty. }

  { The ValidPageNumberList function determines whether a PageNumberList
    is correctly allocated and linked. }

  function ValidPageNumberList (Given: PageNumberList): Boolean;
  var
    Traverser: PageNumberLink;
      { points successively to each component of the linked list in the
        supposed PageNumberList structure }
  begin
    if Given.Head = nil then
      ValidPageNumberList := False
    else begin
      Traverser := Given.Head;
      while Traverser^.Next <> nil do
        Traverser := Traverser^.Next;
      ValidPageNumberList := (Traverser = Given.Tail)
    end
  end;

  { The CreateSingletonPageNumberList function constructs and returns a
    list of page numbers containing only one page number -- the one
    specified by the argument. }

  function CreateSingletonPageNumberList (NewPage: PageNumber):
    PageNumberList;
  var
    Result: PageNumberList;
      { the page number list to be returned, as it is being constructed }
  begin
    New (Result.Head);
    Result.Head^.Datum := NewPage;
    Result.Head^.Next := nil;
    Result.Tail := Result.Head;
    if Debug then
      Assert (ValidPageNumberList (Result),
              CreateSingletonPageNumberListException,
              IndexerExceptionHandler);
    CreateSingletonPageNumberList := Result
  end;

  { The AppendToPageNumberList procedure extends a given page number list
    to include an additional page number at the end, provided that that
    page number is not already at the end of the list.  (If it is, the page
    number list is left unchanged. }

  procedure AppendToPageNumberList (var Locations: PageNumberList;
    NewPage: PageNumber);
  var
    Appendix: PageNumberLink;
      { a pointer to a newly allocated component, to be attached at the
        end of the list }
  begin
    Assert (Locations.Tail <> nil, NilTailInPageNumberListException,
            IndexerExceptionHandler);
    if Locations.Tail^.Datum <> NewPage then begin
      New (Appendix);
      Appendix^.Datum := NewPage;
      Appendix^.Next := nil;
      Locations.Tail^.Next := Appendix;
      Locations.Tail := Appendix
    end;
    if Debug then
      Assert (ValidPageNumberList (Locations),
              AppendToPageNumberListException,
              IndexerExceptionHandler);
  end;

  { The WritePageNumberList procedure writes, to a specified output file,
    all of the page numbers in the list, with a comma and a space before
    each one except the first. }

  procedure WritePageNumberList (var Target: Text; 
    Scribend: PageNumberList);
  const
    Separator = ', ';
      { the string to be used to separate page numbers in the printed
        list } 
  var
    Traverser: PageNumberLink;
      { points to each successive component of Scribend }
  begin
    Assert (ValidPageNumberList (Scribend),
            InvalidPageNumberListArgumentException,
            IndexerExceptionHandler);
    Traverser := Scribend.Head;
    Write (Target, Traverser^.Datum : 1);
    Traverser := Traverser^.Next;
    while Traverser <> nil do begin
      Write (Target, Separator, Traverser^.Datum : 1);
      Traverser := Traverser^.Next
    end
  end;

  { The DeallocatePageNumberList procedure recycles all the dynamically
    allocated storage associated with a PageNumberList. }

  procedure DeallocatePageNumberList (var Delend: PageNumberList);
  var
    Traverser: PageNumberLink;
      { points successively to each component of the PageNumberList to be
        deleted }
    Temporary: PageNumberLink;
      { a pointer to a component that is about to be deleted, immediately
        after Traverser has moved on }
  begin
    Assert (ValidPageNumberList (Delend),
            InvalidPageNumberListArgumentException,
            IndexerExceptionHandler);
    Traverser := Delend.Head;
    while Traverser <> nil do begin
      Temporary := Traverser;
      Traverser := Traverser^.Next;
      Dispose (Temporary)
    end;
    Delend.Head := nil;
    Delend.Tail := nil
  end;

  { The next few procedures and functions deal with lists of entries. }

  { The CreateEntryList procedure constructs and returns an empty
    EntryList. }

  function CreateEntryList: EntryList;
  begin
    CreateEntryList := nil
  end;

  { The AddToEntryList procedure looks up a given string in a given list of
    entries.  If the string is already present, a given page number is
    added to the list of page numbers in the entry for that string; if not,
    a new entry is constructed and inserted at the appropriate point, with
    the given page number as the only item in its list of page numbers. }

  procedure AddToEntryList (NewEntryText: String; NewPage: PageNumber;
    var Index: EntryList);
  var
    Traverser, Trailer: EntryLink;
      { pointers to successive components of Index; Trailer consistently
        points to the component preceding the one Traverser points to }
    Continue: Boolean;
      { indicates whether the search for the given string can and should
        continue } 

    { The BuildNewEntry procedure constructs a new component at a specified
      site, which is presumed to be inside an EntryList, and links it
      to the following element of that EntryList, if any. }

    procedure BuildNewEntry (var Site: EntryLink; NewEntryText: String;
      NewPage: PageNumber);
    var
      Successor: EntryLink;
        { temporary storage for a pointer to the component (if any) at the
          insertion site, which will become the successor of the inserted
          component }
    begin
      Successor := Site;
      New (Site);
      Site^.Datum.EntryText := NewEntryText;
      Site^.Datum.Locations := CreateSingletonPageNumberList (NewPage);
      Site^.Next := Successor
    end;

  begin { procedure AddToEntryList }

    Traverser := Index;
    Continue := True;
    while Continue do

      { If the end of the list of entries is reached, add a new entry
        at the end. }

      if Traverser = nil then begin
        if Traverser = Index then
          BuildNewEntry (Index, NewEntryText, NewPage)
        else
          BuildNewEntry (Trailer^.Next, NewEntryText, NewPage);
        Continue := False
      end

      { If Traverser is pointing to a string that is alphabetically
        prior to the given string, advance Traverser and Trailer (and
        continue the search). }

      else if CaseInsensitivePrecedesString (Traverser^.Datum.EntryText,
                                             NewEntryText) then begin
        Trailer := Traverser;
        Traverser := Traverser^.Next;
      end

      { If Traverser is pointing to a string that is alphabetically
        posterior to the given string, stop and insert a new entry;
        the given string is not in the list of entries and the correct
        insertion point has been reached. }

      else if CaseInsensitivePrecedesString (NewEntryText,
                                Traverser^.Datum.EntryText) then begin
        if Traverser = Index then
          BuildNewEntry (Index, NewEntryText, NewPage)
        else
          BuildNewEntry (Trailer^.Next, NewEntryText, NewPage);
        Continue := False
      end

      { Otherwise, Traverser is pointing to a string that matches the
        given string; add the new page number to the list of page numbers
        in the entry we've found.   The new copy of the string can be
        discarded, since it duplicates the string that already exists in
        the entry. }

      else begin
        DeallocateString (NewEntryText);
        AppendToPageNumberList (Traverser^.Datum.Locations, NewPage);
        Continue := False
      end
  end;

  { The DeallocateEntryList procedure recycles all of the storage
    associated with a given EntryList, including the storage for the string
    and the list of page numbers inside the components of the EntryList. }

  procedure DeallocateEntryList (var Delend: EntryList);
  var
    Traverser: EntryLink;
      { points successively to each component of the EntryList to be
        deleted }
    Temporary: EntryLink;
      { a pointer to a component that is about to be deleted, immediately
        after Traverser has moved on }
  begin
    Traverser := Delend;
    while Traverser <> nil do begin
      DeallocateString (Traverser^.Datum.EntryText);
      DeallocatePageNumberList (Traverser^.Datum.Locations);
      Temporary := Traverser;
      Traverser := Traverser^.Next;
      Dispose (Temporary)
    end;
    Delend := nil
  end;

  { The CompileIndex procedure builds and returns a complete list of the
    marked strings in the input text, with their page numbers, unless a
    syntax error is encountered in the input, in which case it prints out
    the location of each syntax error it detects (but does not return an
    EntryList structure).  The ErrorFound argument indicates whether any
    syntax errors were detected. }

  procedure CompileIndex (var Index: EntryList; var ErrorFound: Boolean);
  const
    EntryMarker = '@';
      { the character that signals the beginning of an entry }
  type
    LineNumber = 1 .. MaxInt;
      { a positive integer to indicate which line of the current page the
        next character to be read notionally appears on }
  var
    CurrentPage: PageNumber;
      { the number of the page currently being read }
    CurrentLine: LineNumber;
      { the number of the line currently being read }
    CurrentEntryText: String;
      { the text of an item to be indexed }
    Success: Boolean;
      { indicates whether the text of an index entry was collected without
        an error }

    { The FormFeed function constructs and returns the form-feed
      character. }

    function FormFeed: Char;
    begin
      FormFeed := Chr (12);
    end;

    { The ReadEntry procedure attempts to collect the text of an entry
      from standard input.  If it is successful, the Success parameter is
      set to True and the text of the entry is returned through
      CurrentEntryText; otherwise, the Success parameter is set to False.
      The value of the CurrentLine parameter is incremented each time a
      line break is consumed from the input. }

    procedure ReadEntry (var CurrentEntryText: String;
      var CurrentLine: LineNumber; var Success: Boolean);
    const
      Starter = '(';
        { a marker indicating that the text of an entry may contain
          several words }
      Stopper = ')';
        { a marker terminating the text of an entry that may contain
          several words }
      Space = ' ';
        { a more legible representation of the space character }
    var
      Done: Boolean;
        { indicates whether the search for the end of the text of the
          current index entry can and should continue }
      Temporary: String;
        { temporary storage for a string to which a new character has been
          added }
    begin
      if EOF then

        { An entry must contain at least one character. }

        Success := False

      else if Input^ = Starter then begin

        { The beginning of an entry that may contain several words has
          been found.  Discard the Starter marker. }

        Get (Input);

        { Collect the text of the index entry; it ends just before the
          first occurrence of Stopper. }

        CurrentEntryText := NullString;
        Done := False;
        while not Done do

          { It is an error for the file to end before Stopper has been
            encountered. }

          if EOF then begin
            Success := False;
            Done := True
          end

          { A line break in an index entry is replaced with a space
            character. }

          else if EOLn then begin
            Temporary := AppendToString (CurrentEntryText, Space);
            DeallocateString (CurrentEntryText);
            CurrentEntryText := Temporary;
            ReadLn;
            CurrentLine := CurrentLine + 1
          end

          { It is an error for the EntryMarker, a page break, or another
            occurrence of the Starter character to appear inside an index
            entry. }

          else if Input^ in [EntryMarker, FormFeed, Starter] then begin
            Success := False;
            Done := True
          end

          { When the stopper is reached, discard it and stop collecting
            the text of the entry.  It is an error for the text to be the
            null string. }

          else if Input^ = Stopper then begin
            Get (Input);
            Success := not EmptyString (CurrentEntryText);
            Done := True
          end

          { In any other case, just add the next character to the text
            of the index entry. }

          else begin
            Temporary := AppendToString (CurrentEntryText, Input^);
            DeallocateString (CurrentEntryText);
            CurrentEntryText := Temporary;
            Get (Input)
          end;

        { Discard the string if a syntax error was detected. }

        if not Success then
          DeallocateString (CurrentEntryText)

      end
      else begin

        { Collect the text of the index entry; it ends just before the next
          line break or space.  }

        CurrentEntryText := NullString;
        Done := False;
        while not Done do

          { It is an error for the file to end before an appropriate
            terminator has been encountered. }

          if EOF then begin
            Success := False;
            Done := True
          end

          { A line break or a space terminates the text of the entry. }

          else if EOLn or (Input^ = Space) then begin
            Success := not EmptyString (CurrentEntryText);
            Done := True
          end

          { It is an error for the EntryMarker, a page break, or the
            Starter character to appear inside an index entry. }

          else if Input^ in [EntryMarker, FormFeed, Starter] then begin
            Success := False;
            Done := True
          end

          { In any other case, just add the next character to the text
            of the index entry. }

          else begin
            Temporary := AppendToString (CurrentEntryText, Input^);
            DeallocateString (CurrentEntryText);
            CurrentEntryText := Temporary;
            Get (Input)
          end;

        { Discard the string if a syntax error was detected. }

        if not Success then
          DeallocateString (CurrentEntryText)

      end
    end;

    { The ReportError procedure reports the location of a syntax error by
      writing it to the standard error output. }

    procedure ReportError (CurrentPage: PageNumber;
      CurrentLine: LineNumber);
    begin
      WriteLn (StdErr, 'A syntax error was encountered on line ',
               CurrentLine : 1, ' of page ', CurrentPage : 1,
               ' of the input.')
    end;

  begin { procedure CompileIndex }
    Index := CreateEntryList;
    ErrorFound := False;
    CurrentPage := 1;
    CurrentLine := 1;
    while not EOF do
      if EOLn then begin { start a new line }
        ReadLn;
        CurrentLine := CurrentLine + 1
      end
      else if Input^ = FormFeed then begin { start a new page }
        Get (Input);
        CurrentPage := CurrentPage + 1;
        CurrentLine := 1
      end
      else if Input^ = EntryMarker then begin { collect an entry }
        Get (Input);
        ReadEntry (CurrentEntryText, CurrentLine, Success);
        if not Success then begin
          ReportError (CurrentPage, CurrentLine);
          if not ErrorFound then begin
            ErrorFound := True;
            DeallocateEntryList (Index)
          end
        end
        else if ErrorFound then
          DeallocateString (CurrentEntryText)
        else
          AddToEntryList (CurrentEntryText, CurrentPage, Index)
      end
      else
        Get (Input)
  end;

  { The PrintIndex procedure writes out the alphabetical list of entries,
    each followed by a list of the pages on which it occurs. }

  procedure PrintIndex (Index: EntryList);
  const
    Space = ' ';
      { a more legible representation of the space character }
  var
    Traverser: EntryLink;
      { points successively to each component of Index }
  begin
    Traverser := Index;
    while Traverser <> nil do begin
      WriteString (Output, Traverser^.Datum.EntryText);
      Write (Space);
      WritePageNumberList (Output, Traverser^.Datum.Locations);
      WriteLn;
      Traverser := Traverser^.Next
    end
  end;

begin { main program }
  CompileIndex (Index, ErrorFound);
  if not ErrorFound then begin
    PrintIndex (Index);
    DeallocateEntryList (Index)
  end
end.

created December 2, 1996
last revised December 2, 1996

John David Stone (stone@math.grin.edu)