{ This program is a solution to exercise #4 for CSC 206, ``Fundamentals of
computer science II,'' offered at Grinnell College in fall semester,
1996.
The program examines and compares the career batting statistics of a
number of baseball players in order to determine which of them are most
similar to one another, in a precisely quantified sense. It reads the
statistics from a specified text file, /u2/stone/datasets/hitters.dat,
and writes its findings to standard output, in the form of an
alphabetized list of players in which each player's name is followed
by a list of other players that are found to be most similar to that
player.
Here are the main steps of the process:
1. Read each player's batting statistics into one record within an array;
if an error is encountered in the entry for a player in the input
file, report it to the user and discard the entry for that player.
2. Arrange the elements of the array in alphabetical order by player's
name.
3. For each player P in the array:
a. Compare P's statistics with that of every other player Q,
calculating a ``disparity score'' quantifying the difference
between P and Q. Keep track of the other players that have the
least disparity scores when compared to P -- the five most similar
players, if there are five or more other players, or else as many
other players as there are.
b. Write out P's name and the names of these most similar players,
with the disparity score for each.
Programmer: John Stone, Grinnell College.
Date of this version: October 9-13, 1996.
}
program SimilarHitters (HitterData, Output, StdErr);
const
NameLength = 22;
{ According to the specification, each player's name fits into a field
containing this many characters. }
MaximumNumberOfPlayers = 130;
{ the largest number of players that this program can accommodate; the
specification says that the data base is supposed to contain
information about this many players }
MaximumSimilarPlayers = 5;
{ the largest number of players to be identified as ``most similar'' to
a given player }
HitterDataFileName = '/u2/stone/datasets/hitters.dat';
{ the operating system's name for the file containing the batting
statistics for various players }
type
NameString = packed array [1 .. NameLength] of Char;
{ a standard Pascal string that can accommodate any player's name }
Natural = 0 .. MaxInt;
{ non-negative integers }
Player = record
Name: NameString;
AtBats: Natural;
Hits: Natural;
Doubles: Natural;
Triples: Natural;
HomeRuns: Natural;
RunsBattedIn: Natural;
BattingAverage: Natural; { in thousandths }
SluggingPercentage: Natural; { in thousandths }
OnBasePercentage: Natural { in thousandths }
end;
{ name and statistics about one player; the batting average, slugging
percentage, and on-base percentage are scaled up by a factor of one
thousand so that integer arithmetic can be used in computing the
disparity scores }
PlayerArray = array [1 .. MaximumNumberOfPlayers] of Player;
{ batting statistics about all of the players }
PlayerTable = record
Data: PlayerArray;
Size: Natural
end;
{ an array of player entries, together with an indication of how much
of that array is actually occupied by valid entries }
DisparityRecord = record
Name: NameString;
Disparity: Natural
end;
{ a player's name, together with the computed measure of the player's
difference from another player }
DisparityArray = array [1 .. MaximumSimilarPlayers] of DisparityRecord;
{ an array of player names and disparity scores }
DisparityTable = record
Data: DisparityArray;
Size: Natural
end;
{ an array of player names and disparity scores, together with an
indication of how much of that array is occupied }
var
HitterData: Text;
{ the file containing the batting statistics for various players }
Players: PlayerTable;
{ one player record for each entry in the source file }
PlayerNumber: Natural;
{ counts off the player records in the array }
SimilarPlayers: DisparityTable;
{ the table of players most similar to a given player }
{ The ReadStatistics procedure collects the batting statistics for all
the players from the input file. }
procedure ReadStatistics (var HitterData: Text;
var Players: PlayerTable);
var
LinesRead: Natural;
{ tallies the lines of the source file as they are read in }
Success: Boolean;
{ indicates whether the attempt to read in statistics about one
player from one line of the file was successful }
{ The ReadPlayer procedure attempts to collect batting statistics for
one player from the source file, returning it through the variable
parameter P. If a syntax error is detected in the source file, the
Success parameter is set to False; otherwise, it is set to True.
The correct format for the batting statistics about one player is
as follows:
Columns 1 through 22 contain the player's name, left-justified.
Column 23 is blank.
Column 24 through 28 contain the number of times the player batted,
right-justified.
Column 29 is blank.
Columns 30 through 33 contain the number of hits the player made,
right-justified.
Column 34 is blank.
Columns 35 through 37 contain the number of doubles the player made,
right-justified.
Column 38 is blank.
Columns 39 through 41 contain the number of triples the player made,
right-justified.
Column 42 is blank.
Columns 43 through 45 contain the number of home runs the player made,
right-justified.
Column 46 is blank.
Columns 47 through 50 contain the number of runs the player batted in,
right-justified.
Column 51 is blank.
Columns 52 through 55 contain the player's career batting average,
written as a decimal point followed by three digits.
Column 56 is blank.
Columns 57 through 60 contain the player's career slugging percentage,
written as a decimal point followed by three digits.
Column 61 is blank.
Columns 62 through 65 contain the player's career on-base percentage,
written as a decimal point followed by three digits.
The procedure enforces this format ruthlessly. }
procedure ReadPlayer (var HitterData: Text; var P: Player;
var Success: Boolean);
label 99;
{ an emergency exit from the procedure, in case a syntax error is
detected in the source file }
const
Space = ' ';
DecimalPoint = '.';
{ formatting characters in the source file }
AtBatsWidth = 5;
{ the number of columns occupied by the number of a player's
at-bats }
HitsWidth = 4;
{ the number of columns occupied by the number of a player's
hits }
DoublesWidth = 3;
{ the number of columns occupied by the number of a player's
doubles }
TriplesWidth = 3;
{ the number of columns occupied by the number of a player's
triples }
HomeRunsWidth = 3;
{ the number of columns occupied by the number of a player's
home runs }
RunsBattedInWidth = 4;
{ the number of columns occupied by the number of a player's
runs batted in }
BattingAverageWidth = 3;
{ the number of columns occupied by the fractional part of a
player's batting average }
SluggingPercentageWidth = 3;
{ the number of columns occupied by the fractional part of a
player's slugging percentage }
OnBasePercentageWidth = 3;
{ the number of columns occupied by the fractional part of a
player's on-base percentage }
{ The ReadName procedure tries to read in, from a specified source
file, exactly NameStringLength characters, without encountering
either the end of a line or the end of the file. If it succeeds,
the characters are stored in the parameter Legend and the parameter
Success is set to True; otherwise, Success is set to False and the
contents of Legend are undefined. }
procedure ReadName (var Source: Text; var Legend: NameString;
var Success: Boolean);
var
Position: Natural;
{ counts off characters as they are inserted into the NameString }
begin
Position := 0;
Success := True;
while Success and (Position < NameLength) do
if EOF (Source) then
Success := False
else if EOLn (Source) then
Success := False
else begin
Position := Position + 1;
Read (Source, Legend[Position])
end
end;
{ The Match procedure tries to read in a specified character from a
specified source file. It indicates whether it has succeeded by
setting the parameter Success. If the character is read, it is
discarded. }
procedure Match (var Source: Text; Sought: Char;
var Success: Boolean);
begin
if EOF (Source) then
Success := False
else if EOLn (Source) then
Success := False
else if Source^ = Sought then begin
Success := True;
Get (Source)
end
else
Success := False
end;
{ The ReadFixedWidthNatural tries to read in a natural number, which
must be right-justified in a field of specified width that is
otherwise occupied by spaces. If it succeeds, it stores the
natural number in the Legend parameter and sets the Success
parameter to True; otherwise, it sets Success to False and the
contents of Legend are undefined.
The procedure can fail for any of several reasons:
* The end of the input file is encountered.
* The end of an input line is encountered.
* The value of the numeral being read exceeds MaxInt.
* A character that is neither a space nor a digit is encountered.
* A space is encountered after the numeral has begun.
The procedure will stop as soon as any of these conditions is
detected, without consuming any erroneous character. }
procedure ReadFixedWidthNatural (var Source: Text; Width: Natural;
var Legend: Natural; var Success: Boolean);
var
Position: Natural;
{ counts off characters as they are read in }
DigitEncountered: Boolean;
{ indicates whether any digit characters have so far been
encountered in the input (if so, no more spaces should be
seen) }
Digit: Natural;
{ the numeric value of the next character of the source file,
known to be a digit character }
{ The IsDigit function determines whether the character it is given
is a digit character. }
function IsDigit (Ch: Char): Boolean;
begin
if (Ch < '0') then
IsDigit := False
else
IsDigit := (Ch <= '9')
end;
{ The DigitValue function takes a character that has been
determined to be a digit and returns its numerical value. }
function DigitValue (Ch: Char): Natural;
begin
DigitValue := Ord (Ch) - Ord ('0')
end;
{ The CanBeExtended function determines whether the natural number
that would result from an attempt to add an extra digit to a
given natural number exceeds MaxInt. It returns True if the
resulting number would not exceed MaxInt and so would still fit
in the Natural type defined above; it returns False if the
computation would cause an overflow. }
function CanBeExtended (Foundation: Natural; Extension: Natural):
Boolean;
begin
if Foundation < MaxInt div 10 then
CanBeExtended := True
else if MaxInt div 10 < Foundation then
CanBeExtended := False
else
CanBeExtended := (Extension <= MaxInt mod 10)
end;
begin { procedure ReadFixedWidthNatural }
Position := 0;
DigitEncountered := False;
Legend := 0;
Success := True;
while (Position < Width) and Success do
if EOF (Source) then
Success := False
else if EOLn (Source) then
Success := False
else if IsDigit (Source^) then begin
DigitEncountered := True;
Digit := DigitValue (Source^);
if CanBeExtended (Legend, Digit) then begin
Legend := Legend * 10 + Digit;
Get (Source);
Position := Position + 1
end
else
Success := False
end
else if Source^ <> Space then
Success := False
else if DigitEncountered then
Success := False
else begin
Get (Source);
Position := Position + 1
end;
if not DigitEncountered then
Success := False
end;
begin { procedure ReadPlayer }
ReadName (HitterData, P.Name, Success);
if not Success then
goto 99;
Match (HitterData, Space, Success);
if not Success then
goto 99;
ReadFixedWidthNatural (HitterData, AtBatsWidth, P.AtBats, Success);
if not Success then
goto 99;
Match (HitterData, Space, Success);
if not Success then
goto 99;
ReadFixedWidthNatural (HitterData, HitsWidth, P.Hits, Success);
if not Success then
goto 99;
Match (HitterData, Space, Success);
if not Success then
goto 99;
ReadFixedWidthNatural (HitterData, DoublesWidth, P.Doubles, Success);
if not Success then
goto 99;
Match (HitterData, Space, Success);
if not Success then
goto 99;
ReadFixedWidthNatural (HitterData, TriplesWidth, P.Triples, Success);
if not Success then
goto 99;
Match (HitterData, Space, Success);
if not Success then
goto 99;
ReadFixedWidthNatural (HitterData, HomeRunsWidth, P.HomeRuns,
Success);
if not Success then
goto 99;
Match (HitterData, Space, Success);
if not Success then
goto 99;
ReadFixedWidthNatural (HitterData, RunsBattedInWidth, P.RunsBattedIn,
Success);
if not Success then
goto 99;
Match (HitterData, Space, Success);
if not Success then
goto 99;
Match (HitterData, DecimalPoint, Success);
if not Success then
goto 99;
ReadFixedWidthNatural (HitterData, BattingAverageWidth,
P.BattingAverage, Success);
if not Success then
goto 99;
Match (HitterData, Space, Success);
if not Success then
goto 99;
Match (HitterData, DecimalPoint, Success);
if not Success then
goto 99;
ReadFixedWidthNatural (HitterData, SluggingPercentageWidth,
P.SluggingPercentage, Success);
if not Success then
goto 99;
Match (HitterData, Space, Success);
if not Success then
goto 99;
Match (HitterData, DecimalPoint, Success);
if not Success then
goto 99;
ReadFixedWidthNatural (HitterData, OnBasePercentageWidth,
P.OnBasePercentage, Success);
if not Success then
goto 99;
if not EOLn (HitterData) then
Success := False;
99:
end;
begin { procedure ReadStatistics }
Players.Size := 0;
LinesRead := 0;
while not EOF (HitterData) and
(Players.Size < MaximumNumberOfPlayers) do begin
ReadPlayer (HitterData, Players.Data[Players.Size + 1], Success);
ReadLn (HitterData);
LinesRead := LinesRead + 1;
if Success then
Players.Size := Players.Size + 1
else
WriteLn (StdErr, 'Line #', LinesRead : 1, ' of the source file ',
'contained incorrectly formatted data and was discarded.')
end;
if not EOF (HitterData) then begin
WriteLn (StdErr, 'The source file contained additional data after ',
'the first ', MaximumNumberOfPlayers : 1, ' valid entries.');
WriteLn (StdErr, 'These additional data were discarded.')
end
end;
{ The Alphabetize procedure rearranges the player records in a
PlayerTable so that they are in alphabetical order by player name. }
procedure Alphabetize (var Players: PlayerTable);
{ The Partition procedure runs through the elements in a specified
segment of an array, collecting those that precede a specified pivot
at the low-subscript end of the segment and shifting the rest to the
high-subscript end. The 'divider' parameter keeps track of the
position of the last element in the low-end partition; if there are
no elements in that partition, its value is set to one less than the
lower boundary of the entire array segment. }
procedure Partition (var Arr: PlayerArray; Start, Finish: Integer;
Pivot: Player; var Divider: Natural);
var
Position: Natural;
{ counts off the positions in the array segment, from Start to
Finish }
Temporary: Player;
{ temporary storage for a player record being moved from one
position to another }
begin
Divider := Start - 1;
for Position := Start to Finish do
if Arr[Position].Name < Pivot.Name then begin
Divider := Divider + 1;
Temporary := Arr[Position];
Arr[Position] := Arr[Divider];
Arr[Divider] := Temporary
end
end;
{ The Sort procedure rearranges the player records in the segment of
an array lying between the (inclusive) boundaries specified by its
Start and Finish parameters so that they are in alphabetical order
by player name. }
procedure Sort (var Arr: PlayerArray; Start, Finish: Natural);
var
Divider: Natural;
{ the highest-numbered position occupied by an element that
precedes Arr[Start]; if there is no such position, Divider =
Start }
Temporary: Player;
{ temporary storage for an element being moved from one position to
another }
begin
if Start < Finish then begin
Partition (Arr, Start + 1, Finish, Arr[Start], Divider);
Temporary := Arr[Start];
Arr[Start] := Arr[Divider];
Arr[Divider] := Temporary;
Sort (Arr, Start, Divider - 1);
Sort (Arr, Divider + 1, Finish)
end
end;
begin { procedure Alphabetize }
Sort (Players.Data, 1, Players.Size)
end;
{ The FindSimilarPlayers procedure compares one particular player, the
one whose position in the Players table is given by the PlayerNumber
parameter, with each of the other players, determines a disparity
score for each of them, and keeps track of the names and disparity
scores of each of the other players that are most similar to the
one in position PlayerNumber. }
procedure FindSimilarPlayers (Players: PlayerTable;
PlayerNumber: Natural; var SimilarPlayers: DisparityTable);
var
OtherPlayerNumber: Natural;
{ the position of the other player in the Players table }
{ The InsertOrDiscard procedure determines whether a given player's
disparity score is low enough to be placed in the table of ``most
similar'' players and, if so, correctly positions that player's
name and disparity score in the table.
Here's the plan: Initially, the given player is assigned a notional
position in the table, one greater than its current size. Then the
player's disparity score is compared with those of players already
in the table, starting with the highest such score. If the score
of a player in the table is equal to or greater than the score of
the newly arriving player, the comparisons stop; if the new player's
score is less, however, the old player's score is shifted down one
position in the table (or discarded, if it is already in the last
real position in the table) and the next comparison is made with at
the next lower position in the table. The process continues until
either it is halted in this way or all of the positions in the table
have been examined and all the old players shifted down.
At that point, if the new player has reached a position that is
actually in the table, his data are inserted into the table at that
position. If the new player remains in the original notional
position, he is added to the table only if it was not previously
full. The size of the table increases when the new player is added
if it was not previously full. }
procedure InsertOrDiscard (PlayerName: NameString; Score: Natural;
var SimilarPlayers: DisparityTable);
var
Position: Natural;
{ the notional position initially occupied by the new player }
Continue: Boolean;
{ indicates whether the comparisons can and should continue }
begin
with SimilarPlayers do begin
Position := Size + 1;
Continue := (1 < Position);
while Continue do
if Score < Data[Position - 1].Disparity then begin
if Position <= MaximumSimilarPlayers then
Data[Position] := Data[Position - 1];
Position := Position - 1;
Continue := (1 < Position)
end
else
Continue := False;
if Position <= MaximumSimilarPlayers then begin
Data[Position].Name := PlayerName;
Data[Position].Disparity := Score;
if Size < MaximumSimilarPlayers then
Size := Size + 1
end
end
end;
{ The DisparityScore function computes and returns a quantitative
estimate of the dissimilarity of two players, based on their batting
statistics. }
function DisparityScore (Primero, Segundo: Player): Natural;
const
AtBatWeight = 1;
HitWeight = 3;
DoubleWeight = 10;
TripleWeight = 15;
HomeRunWeight = 15;
RunsBattedInWeight = 6;
BattingAverageWeight = 12;
SluggingPercentageWeight = 8;
OnBasePercentageWeight = 9;
{ multipliers for the differences between corresponding performance
totals for the two players }
begin
DisparityScore :=
AtBatWeight * Abs (Primero.AtBats - Segundo.AtBats) +
HitWeight * Abs (Primero.Hits - Segundo.Hits) +
DoubleWeight * Abs (Primero.Doubles - Segundo.Doubles) +
TripleWeight * Abs (Primero.Triples - Segundo.Triples) +
HomeRunWeight * Abs (Primero.HomeRuns - Segundo.HomeRuns) +
RunsBattedInWeight * Abs (Primero.RunsBattedIn -
Segundo.RunsBattedIn) +
BattingAverageWeight * Abs (Primero.BattingAverage -
Segundo.BattingAverage) +
SluggingPercentageWeight * Abs (Primero.SluggingPercentage -
Segundo.SluggingPercentage) +
OnBasePercentageWeight * Abs (Primero.OnBasePercentage -
Segundo.OnBasePercentage)
end;
begin { procedure FindSimilarPlayers }
SimilarPlayers.Size := 0;
for OtherPlayerNumber := 1 to Players.Size do
if OtherPlayerNumber <> PlayerNumber then
InsertOrDiscard (Players.Data[OtherPlayerNumber].Name,
DisparityScore (Players.Data[PlayerNumber],
Players.Data[OtherPlayerNumber]),
SimilarPlayers)
end;
{ The DisplaySimilarPlayers procedure prints out the name of a given
player and the names and disparity scores of the other players most
similar to him. }
procedure DisplaySimilarPlayers (Given: Player;
SimilarPlayers: DisparityTable);
var
OtherPlayerNumber: Natural;
{ the position of one of the similar players in the SimilarPlayers
table }
{ The length function computes and returns the length of a string,
not including any trailing spaces. }
function Length (S: NameString): Natural;
const
Space = ' ';
{ a legible name for the space character }
var
Position: Natural;
{ counts down through the positions in the string }
Continue: Boolean;
{ indicates whether the search for a non-space character at the end
of the string can and should continue }
begin
Position := NameLength;
Continue := True;
while Continue do
if Position = 0 then
Continue := False
else if S[Position] = Space then
Position := Position - 1
else
Continue := False;
Length := Position
end;
begin { procedure DisplaySimilarPlayers }
WriteLn ('Most similar to ', Given.Name : Length (Given.Name), ':');
with SimilarPlayers do
for OtherPlayerNumber := 1 to Size do
with Data[OtherPlayerNumber] do
WriteLn (' ', Name : Length (Name), ' (disparity ',
Disparity : 1, ')');
WriteLn
end;
begin { main program }
Reset (HitterData, HitterDataFileName);
ReadStatistics (HitterData, Players);
Alphabetize (Players);
for PlayerNumber := 1 to Players.Size do begin
FindSimilarPlayers (Players, PlayerNumber, SimilarPlayers);
DisplaySimilarPlayers (Players.Data[PlayerNumber], SimilarPlayers)
end
end.
created October 28, 1996
last revised October 28, 1996
John David Stone
(stone@math.grin.edu)