7.12. Merging Scanner And Parser

Now that we've covered all of the theory and general aspects of lexical scanning that we'll be needing, I'm finally ready to back up my claim that we can accomodate multi-character tokens with minimal change to our previous work. To keep things short and simple I will restrict myself here to a subset of what we've done before; I'm allowing only one control construct (the IF) and no Boolean expressions. That's enough to demonstrate the parsing of both keywords and expressions. The extension to the full set of constructs should be pretty apparent from what we've already done.

All the elements of the program to parse this subset, using single-character tokens, exist already in our previous programs. I built it by judicious copying of these files, but I wouldn't dare try to lead you through that process. Instead, to avoid any confusion, the whole program is shown below:

Example 7.1. KISS v.0

program KISS;

{ Constant Declarations }
const TAB = ^I;
      CR  = ^M;
      LF  = ^J;

{ Type Declarations  }
type Symbol = string[8];
     SymTab = array[1..1000] of Symbol;
     TabPtr = ^SymTab;

{ Variable Declarations }
var Look  : char;              { Lookahead Character }
    Lcount: integer;           { Label Counter       }

{ Read New Character From Input Stream }
procedure GetChar;
begin
   Read(Look);
end;

{ Report an Error }
procedure Error(s: string);
begin
   WriteLn;
   WriteLn(^G, 'Error: ', s, '.');
end;

{ Report Error and Halt }
procedure Abort(s: string);
begin
   Error(s);
   Halt;
end;

{ Report What Was Expected }
procedure Expected(s: string);
begin
   Abort(s + ' Expected');
end;

{ Recognize an Alpha Character }
function IsAlpha(c: char): boolean;
begin
   IsAlpha := UpCase(c) in ['A'..'Z'];
end;

{ Recognize a Decimal Digit }
function IsDigit(c: char): boolean;
begin
   IsDigit := c in ['0'..'9'];
end;

{ Recognize an AlphaNumeric Character }
function IsAlNum(c: char): boolean;
begin
   IsAlNum := IsAlpha(c) or IsDigit(c);
end;
                             
{ Recognize an Addop }
function IsAddop(c: char): boolean;
begin
   IsAddop := c in ['+', '-'];
end;

{ Recognize a Mulop }
function IsMulop(c: char): boolean;
begin
   IsMulop := c in ['*', '/'];
end;

{ Recognize White Space }
function IsWhite(c: char): boolean;
begin
   IsWhite := c in [' ', TAB];
end;

{ Skip Over Leading White Space }
procedure SkipWhite;
begin
   while IsWhite(Look) do
      GetChar;
end;

{ Match a Specific Input Character }
procedure Match(x: char);
begin
   if Look <> x then Expected('''' + x + '''');
   GetChar;
   SkipWhite;
end;

{ Skip a CRLF }
procedure Fin;
begin
   if Look = CR then GetChar;
   if Look = LF then GetChar;
   SkipWhite;
end;

{ Get an Identifier }
function GetName: char;
begin
   while Look = CR do
      Fin;
   if not IsAlpha(Look) then Expected('Name');
   Getname := UpCase(Look);
   GetChar;
   SkipWhite;
end;

{ Get a Number }
function GetNum: char;
begin
   if not IsDigit(Look) then Expected('Integer');
   GetNum := Look;
   GetChar;
   SkipWhite;
end;

{ Generate a Unique Label }
function NewLabel: string;
var S: string;
begin
   Str(LCount, S);
   NewLabel := 'L' + S;
   Inc(LCount);
end;

{ Post a Label To Output }
procedure PostLabel(L: string);
begin
   WriteLn(L, ':');
end;

{ Output a String with Tab }
procedure Emit(s: string);
begin
   Write(TAB, s);
end;

{ Output a String with Tab and CRLF }
procedure EmitLn(s: string);
begin
   Emit(s);
   WriteLn;
end;

{ Parse and Translate an Identifier }
procedure Ident;
var Name: char;
begin
   Name := GetName;
   if Look = '(' then begin
      Match('(');
      Match(')');
      EmitLn('BSR ' + Name);
      end
   else
      EmitLn('MOVE ' + Name + '(PC),D0');
end;

{ Parse and Translate a Math Factor }
procedure Expression; Forward;
procedure Factor;
begin
   if Look = '(' then begin
      Match('(');
      Expression;
      Match(')');
      end
   else if IsAlpha(Look) then
      Ident
   else
      EmitLn('MOVE #' + GetNum + ',D0');
end;

{ Parse and Translate the First Math Factor }
procedure SignedFactor;
var s: boolean;
begin
   s := Look = '-';
   if IsAddop(Look) then begin
      GetChar;
      SkipWhite;
   end;
   Factor;
   if s then
      EmitLn('NEG D0');
end;

{ Recognize and Translate a Multiply }
procedure Multiply;
begin
   Match('*');
   Factor;
   EmitLn('MULS (SP)+,D0');
end;

{ Recognize and Translate a Divide }
procedure Divide;
begin
   Match('/');
   Factor;
   EmitLn('MOVE (SP)+,D1');
   EmitLn('EXS.L D0');
   EmitLn('DIVS D1,D0');
end;

{ Completion of Term Processing  (called by Term and FirstTerm }
procedure Term1;
begin
   while IsMulop(Look) do begin
      EmitLn('MOVE D0,-(SP)');
      case Look of
       '*': Multiply;
       '/': Divide;
      end;
   end;
end;

{ Parse and Translate a Math Term }
procedure Term;
begin
   Factor;
   Term1;
end;

{ Parse and Translate a Math Term with Possible Leading Sign }
procedure FirstTerm;
begin
   SignedFactor;
   Term1;
end;

{ Recognize and Translate an Add }
procedure Add;
begin
   Match('+');
   Term;
   EmitLn('ADD (SP)+,D0');
end;

{ Recognize and Translate a Subtract }
procedure Subtract;
begin
   Match('-');
   Term;
   EmitLn('SUB (SP)+,D0');
   EmitLn('NEG D0');
end;

{ Parse and Translate an Expression }
procedure Expression;
begin
   FirstTerm;
   while IsAddop(Look) do begin
      EmitLn('MOVE D0,-(SP)');
      case Look of
       '+': Add;
       '-': Subtract;
      end;
   end;
end;

{ Parse and Translate a Boolean Condition }
{ This version is a dummy }
Procedure Condition;
begin
   EmitLn('Condition');
end;

{ Recognize and Translate an IF Construct }
procedure Block; Forward;
procedure DoIf;
var L1, L2: string;
begin
   Match('i');
   Condition;
   L1 := NewLabel;
   L2 := L1;
   EmitLn('BEQ ' + L1);
   Block;
   if Look = 'l' then begin
      Match('l');
      L2 := NewLabel;
      EmitLn('BRA ' + L2);
      PostLabel(L1);
      Block;
   end;
   PostLabel(L2);
   Match('e');
end;

{ Parse and Translate an Assignment Statement }
procedure Assignment;
var Name: char;
begin
   Name := GetName;
   Match('=');
   Expression;
   EmitLn('LEA ' + Name + '(PC),A0');
   EmitLn('MOVE D0,(A0)');
end;

{ Recognize and Translate a Statement Block }
procedure Block;
begin
   while not(Look in ['e', 'l']) do begin
      case Look of
       'i': DoIf;
       CR: while Look = CR do
              Fin;
       else Assignment;
      end;
   end;
end;

{ Parse and Translate a Program }
procedure DoProgram;
begin
   Block;
   if Look <> 'e' then Expected('END');
   EmitLn('END')
end;

{ Initialize }
procedure Init;
begin
   LCount := 0;
   GetChar;
end;

{ Main Program }
begin
   Init;
   DoProgram;
end.

A couple of comments:

  1. The form for the expression parser, using FirstTerm, etc., is a little different from what you've seen before. It's yet another variation on the same theme. Don't let it throw you … the change is not required for what follows.
  2. Note that, as usual, I had to add calls to Fin at strategic spots to allow for multiple lines.

Before we proceed to adding the scanner, first copy this file and verify that it does indeed parse things correctly. Don't forget the “codes”: i for IF, l for ELSE, and e for END or ENDIF.

If the program works, then let's press on. In adding the scanner modules to the program, it helps to have a systematic plan. In all the parsers we've written to date, we've stuck to a convention that the current lookahead character should always be a non-blank character. We preload the lookahead character in Init, and keep the “pump primed” after that. To keep the thing working right at newlines, we had to modify this a bit and treat the newline as a legal token.

In the multi-character version, the rule is similar: The current lookahead character should always be left at the beginning of the next token, or at a newline.

The multi-character version is shown next. To get it, I've made the following changes:

Here is the program in its entirety:

Example 7.2. KISS

program KISS;
                             
{ Constant Declarations }

const TAB = ^I;
      CR  = ^M;
      LF  = ^J;

{ Type Declarations  }
type Symbol = string[8];
     SymTab = array[1..1000] of Symbol;
     TabPtr = ^SymTab;


{ Variable Declarations }
var Look  : char;              { Lookahead Character }
    Token : char;              { Encoded Token       }
    Value : string[16];        { Unencoded Token     }
    Lcount: integer;           { Label Counter       }


{ Definition of Keywords and Token Types }
const KWlist: array [1..4] of Symbol =
              ('IF', 'ELSE', 'ENDIF', 'END');
const KWcode: string[5] = 'xilee';


{ Read New Character From Input Stream }
procedure GetChar;
begin
   Read(Look);
end;

{ Report an Error }
procedure Error(s: string);
begin
   WriteLn;
   WriteLn(^G, 'Error: ', s, '.');
end;


{ Report Error and Halt }
procedure Abort(s: string);
begin
   Error(s);
   Halt;
end;


{ Report What Was Expected }
procedure Expected(s: string);
begin
   Abort(s + ' Expected');
end;


{ Recognize an Alpha Character }
function IsAlpha(c: char): boolean;
begin
   IsAlpha := UpCase(c) in ['A'..'Z'];
end;


{ Recognize a Decimal Digit }
function IsDigit(c: char): boolean;
begin
   IsDigit := c in ['0'..'9'];
end;


{ Recognize an AlphaNumeric Character }
function IsAlNum(c: char): boolean;
begin
   IsAlNum := IsAlpha(c) or IsDigit(c);
end;


{ Recognize an Addop }
function IsAddop(c: char): boolean;
begin
   IsAddop := c in ['+', '-'];
end;


{ Recognize a Mulop }
function IsMulop(c: char): boolean;
begin
   IsMulop := c in ['*', '/'];
end;


{ Recognize White Space }
function IsWhite(c: char): boolean;
begin
   IsWhite := c in [' ', TAB];
end;


{ Skip Over Leading White Space }
procedure SkipWhite;
begin
   while IsWhite(Look) do
      GetChar;
end;


{ Match a Specific Input Character }
procedure Match(x: char);
begin
   if Look <> x then Expected('''' + x + '''');
   GetChar;
   SkipWhite;
end;


{ Skip a CRLF }
procedure Fin;
begin
   if Look = CR then GetChar;
   if Look = LF then GetChar;
   SkipWhite;
end;


{ Table Lookup }
function Lookup(T: TabPtr; s: string; n: integer): integer;
var i: integer;
    found: boolean;
begin
   found := false;
   i := n;
   while (i > 0) and not found do
      if s = T^[i] then
         found := true
      else
         dec(i);
   Lookup := i;
end;


{ Get an Identifier }
procedure GetName;
begin
   while Look = CR do
      Fin;
   if not IsAlpha(Look) then Expected('Name');
   Value := '';
   while IsAlNum(Look) do begin
     Value := Value + UpCase(Look);
     GetChar;
   end;
   SkipWhite;
end;


{ Get a Number }
procedure GetNum;
begin
   if not IsDigit(Look) then Expected('Integer');
   Value := '';
   while IsDigit(Look) do begin
     Value := Value + Look;
     GetChar;
   end;
   Token := '#';
   SkipWhite;
end;


{ Get an Identifier and Scan it for Keywords }
procedure Scan;
begin
   GetName;
   Token := KWcode[Lookup(Addr(KWlist), Value, 4) + 1];
end;
                             

{ Match a Specific Input String }
procedure MatchString(x: string);
begin
   if Value <> x then Expected('''' + x + '''');
end;


{ Generate a Unique Label }
function NewLabel: string;
var S: string;
begin
   Str(LCount, S);
   NewLabel := 'L' + S;
   Inc(LCount);
end;


{ Post a Label To Output }
procedure PostLabel(L: string);
begin
   WriteLn(L, ':');
end;


{ Output a String with Tab }
procedure Emit(s: string);
begin
   Write(TAB, s);
end;


{ Output a String with Tab and CRLF }
procedure EmitLn(s: string);
begin
   Emit(s);
   WriteLn;
end;


{ Parse and Translate an Identifier }
procedure Ident;
begin
   GetName;
   if Look = '(' then begin
      Match('(');
      Match(')');
      EmitLn('BSR ' + Value);
      end
   else
      EmitLn('MOVE ' + Value + '(PC),D0');
end;


{ Parse and Translate a Math Factor }
procedure Expression; Forward;
procedure Factor;
begin
   if Look = '(' then begin
      Match('(');
      Expression;
      Match(')');
      end
   else if IsAlpha(Look) then
      Ident
   else begin
      GetNum;
      EmitLn('MOVE #' + Value + ',D0');
   end;
end;


{ Parse and Translate the First Math Factor }
procedure SignedFactor;
var s: boolean;
begin
   s := Look = '-';
   if IsAddop(Look) then begin
      GetChar;
      SkipWhite;
   end;
   Factor;
   if s then
      EmitLn('NEG D0');
end;


{ Recognize and Translate a Multiply }
procedure Multiply;
begin
   Match('*');
   Factor;
   EmitLn('MULS (SP)+,D0');
end;


{ Recognize and Translate a Divide }
procedure Divide;
begin
   Match('/');
   Factor;
   EmitLn('MOVE (SP)+,D1');
   EmitLn('EXS.L D0');
   EmitLn('DIVS D1,D0');
end;


{ Completion of Term Processing  (called by Term and FirstTerm }
procedure Term1;
begin
   while IsMulop(Look) do begin
      EmitLn('MOVE D0,-(SP)');
      case Look of
       '*': Multiply;
       '/': Divide;
      end;
   end;
end;


{ Parse and Translate a Math Term }
procedure Term;
begin
   Factor;
   Term1;
end;


{ Parse and Translate a Math Term with Possible Leading Sign }
procedure FirstTerm;
begin
   SignedFactor;
   Term1;
end;


{ Recognize and Translate an Add }
procedure Add;
begin
   Match('+');
   Term;
   EmitLn('ADD (SP)+,D0');
end;


{ Recognize and Translate a Subtract }
procedure Subtract;
begin
   Match('-');
   Term;
   EmitLn('SUB (SP)+,D0');
   EmitLn('NEG D0');
end;


{ Parse and Translate an Expression }
procedure Expression;
begin
   FirstTerm;
   while IsAddop(Look) do begin
      EmitLn('MOVE D0,-(SP)');
      case Look of
       '+': Add;
       '-': Subtract;
      end;
   end;
end;


{ Parse and Translate a Boolean Condition }
{ This version is a dummy }
Procedure Condition;
begin
   EmitLn('Condition');
end;


{ Recognize and Translate an IF Construct }
procedure Block; Forward;
procedure DoIf;
var L1, L2: string;
begin
   Condition;
   L1 := NewLabel;
   L2 := L1;
   EmitLn('BEQ ' + L1);
   Block;
   if Token = 'l' then begin
      L2 := NewLabel;
      EmitLn('BRA ' + L2);
      PostLabel(L1);
      Block;
   end;
   PostLabel(L2);
   MatchString('ENDIF');
end;


{ Parse and Translate an Assignment Statement }
procedure Assignment;
var Name: string;
begin
   Name := Value;
   Match('=');
   Expression;
   EmitLn('LEA ' + Name + '(PC),A0');
   EmitLn('MOVE D0,(A0)');
end;


{ Recognize and Translate a Statement Block }
procedure Block;
begin
   Scan;
   while not (Token in ['e', 'l']) do begin
      case Token of
       'i': DoIf;
       else Assignment;
      end;
      Scan;
   end;
end;


{ Parse and Translate a Program }
procedure DoProgram;
begin
   Block;
   MatchString('END');
   EmitLn('END')
end;


{ Initialize }
procedure Init;
begin
   LCount := 0;
   GetChar;
end;


{ Main Program }
begin
   Init;
   DoProgram;
end.

Compare this program with its single-character counterpart. I think you will agree that the differences are minor.