Page Menu
Home
HEPForge
Search
Configure Global Search
Log In
Files
F10881594
parsing.nw
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
174 KB
Subscribers
None
parsing.nw
View Options
% -*- ess-noweb-default-code-mode: f90-mode; noweb-default-code-mode: f90-mode; -*-
% WHIZARD code as NOWEB source: text handling and parsing
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Text handling}
\includemodulegraph{parsing}
\whizard\ has to handle complex structures in input (and output) data.
Doing this in a generic and transparent way requires a generic lexer
and parser. The necessary modules are implemented here:
\begin{description}
\item[ifiles]
Implementation of line-oriented internal files in a more flexible
way (linked lists of variable-length strings) than the Fortran builtin
features.
\item[lexers]
Read text and transform it into a token stream.
\item[syntax\_rules]
Define the rules for interpreting tokens, to be used by the WHIZARD parser.
\item[parser]
Categorize tokens (keyword, string, number etc.) and use a set of
syntax rules to transform the input into a parse tree.
\item[xml]
Read and parse XML text, separate from the WHIZARD parser.
\end{description}
@
\clearpage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Internal files}
The internal files introduced here ([[ifile]]) are a replacement for
the built-in internal files, which are fixed-size arrays of
fixed-length character strings. The [[ifile]] type is a doubly-linked
list of variable-length character strings with line numbers.
<<[[ifiles.f90]]>>=
<<File header>>
module ifiles
<<Use strings>>
use io_units
use system_defs, only: EOF
<<Standard module head>>
<<Ifiles: public>>
<<Ifiles: types>>
<<Ifiles: interfaces>>
contains
<<Ifiles: subroutines>>
end module ifiles
@ %def ifiles
@
\subsection{The line type}
The line entry type is for internal use, it is the list entry to be
collected in an [[ifile]] object.
<<Ifiles: types>>=
type :: line_entry_t
private
type(line_entry_t), pointer :: previous => null ()
type(line_entry_t), pointer :: next => null ()
type(string_t) :: string
integer :: index
end type line_entry_t
@ %def line_entry_t
@ Create a new list entry, given a varying string as input. The line
number and pointers are not set, these make sense only within an
[[ifile]].
<<Ifiles: subroutines>>=
subroutine line_entry_create (line, string)
type(line_entry_t), pointer :: line
type(string_t), intent(in) :: string
allocate (line)
line%string = string
end subroutine line_entry_create
@ %def line_entry_create
@ Destroy a single list entry: Since the pointer components should not
be deallocated explicitly, just deallocate the object itself.
<<Ifiles: subroutines>>=
subroutine line_entry_destroy (line)
type(line_entry_t), pointer :: line
deallocate (line)
end subroutine line_entry_destroy
@ %def line_entry_destroy
@
\subsection{The ifile type}
The internal file is a linked list of line entries.
<<Ifiles: public>>=
public :: ifile_t
<<Ifiles: types>>=
type :: ifile_t
private
type(line_entry_t), pointer :: first => null ()
type(line_entry_t), pointer :: last => null ()
integer :: n_lines = 0
contains
<<Ifiles: ifile: TBP>>
end type ifile_t
@ %def ifile_t
@ We need no explicit initializer, but a routine which recursively
deallocates the contents may be appropriate. After this, existing
line pointers may become undefined, so they should be nullified before
the file is destroyed.
<<Ifiles: public>>=
public :: ifile_clear
<<Ifiles: subroutines>>=
subroutine ifile_clear (ifile)
class(ifile_t), intent(inout) :: ifile
type(line_entry_t), pointer :: current
do while (associated (ifile%first))
current => ifile%first
ifile%first => current%next
call line_entry_destroy (current)
end do
nullify (ifile%last)
ifile%n_lines = 0
end subroutine ifile_clear
@ %def ifile_clear
@ The finalizer is just an alias for the above.
<<Ifiles: public>>=
public :: ifile_final
<<Ifiles: ifile: TBP>>=
procedure :: final => ifile_clear
<<Ifiles: interfaces>>=
interface ifile_final
module procedure ifile_clear
end interface
@ %def ifile_final
@
\subsection{I/O on ifiles}
Fill an ifile from an ordinary external file, i.e., I/O unit. If the
ifile is not empty, the old contents will be destroyed. We can read
a fixed-length character string, an ISO varying string, an ordinary
internal file (character-string array), or from an external unit. In
the latter case, lines are appended until EOF is reached. Finally,
there is a variant which reads from another ifile, effectively copying it.
<<Ifiles: public>>=
public :: ifile_read
<<Ifiles: interfaces>>=
interface ifile_read
module procedure ifile_read_from_string
module procedure ifile_read_from_char
module procedure ifile_read_from_unit
module procedure ifile_read_from_char_array
module procedure ifile_read_from_ifile
end interface
<<Ifiles: subroutines>>=
subroutine ifile_read_from_string (ifile, string)
type(ifile_t), intent(inout) :: ifile
type(string_t), intent(in) :: string
call ifile_clear (ifile)
call ifile_append (ifile, string)
end subroutine ifile_read_from_string
subroutine ifile_read_from_char (ifile, char)
type(ifile_t), intent(inout) :: ifile
character(*), intent(in) :: char
call ifile_clear (ifile)
call ifile_append (ifile, char)
end subroutine ifile_read_from_char
subroutine ifile_read_from_char_array (ifile, char)
type(ifile_t), intent(inout) :: ifile
character(*), dimension(:), intent(in) :: char
call ifile_clear (ifile)
call ifile_append (ifile, char)
end subroutine ifile_read_from_char_array
subroutine ifile_read_from_unit (ifile, unit, iostat)
type(ifile_t), intent(inout) :: ifile
integer, intent(in) :: unit
integer, intent(out), optional :: iostat
call ifile_clear (ifile)
call ifile_append (ifile, unit, iostat)
end subroutine ifile_read_from_unit
subroutine ifile_read_from_ifile (ifile, ifile_in)
type(ifile_t), intent(inout) :: ifile
type(ifile_t), intent(in) :: ifile_in
call ifile_clear (ifile)
call ifile_append (ifile, ifile_in)
end subroutine ifile_read_from_ifile
@ %def ifile_read
@ Append to an ifile. The same as reading, but without resetting the
ifile. In addition, there is a routine for appending a whole ifile.
<<Ifiles: public>>=
public :: ifile_append
<<Ifiles: ifile: TBP>>=
generic :: append => &
ifile_append_from_char
procedure, private :: ifile_append_from_char
<<Ifiles: interfaces>>=
interface ifile_append
module procedure ifile_append_from_string
module procedure ifile_append_from_char
module procedure ifile_append_from_unit
module procedure ifile_append_from_char_array
module procedure ifile_append_from_ifile
end interface
<<Ifiles: subroutines>>=
subroutine ifile_append_from_string (ifile, string)
class(ifile_t), intent(inout) :: ifile
type(string_t), intent(in) :: string
type(line_entry_t), pointer :: current
call line_entry_create (current, string)
current%index = ifile%n_lines + 1
if (associated (ifile%last)) then
current%previous => ifile%last
ifile%last%next => current
else
ifile%first => current
end if
ifile%last => current
ifile%n_lines = current%index
end subroutine ifile_append_from_string
subroutine ifile_append_from_char (ifile, char)
class(ifile_t), intent(inout) :: ifile
character(*), intent(in) :: char
call ifile_append_from_string (ifile, var_str (trim (char)))
end subroutine ifile_append_from_char
subroutine ifile_append_from_char_array (ifile, char)
class(ifile_t), intent(inout) :: ifile
character(*), dimension(:), intent(in) :: char
integer :: i
do i = 1, size (char)
call ifile_append_from_string (ifile, var_str (trim (char(i))))
end do
end subroutine ifile_append_from_char_array
subroutine ifile_append_from_unit (ifile, unit, iostat)
class(ifile_t), intent(inout) :: ifile
integer, intent(in) :: unit
integer, intent(out), optional :: iostat
type(string_t) :: buffer
integer :: ios
ios = 0
READ_LOOP: do
call get (unit, buffer, iostat = ios)
if (ios == EOF .or. ios > 0) exit READ_LOOP
call ifile_append_from_string (ifile, buffer)
end do READ_LOOP
if (present (iostat)) then
iostat = ios
else if (ios > 0) then
call get (unit, buffer) ! trigger error again
end if
end subroutine ifile_append_from_unit
subroutine ifile_append_from_ifile (ifile, ifile_in)
class(ifile_t), intent(inout) :: ifile
type(ifile_t), intent(in) :: ifile_in
type(line_entry_t), pointer :: current
current => ifile_in%first
do while (associated (current))
call ifile_append_from_string (ifile, current%string)
current => current%next
end do
end subroutine ifile_append_from_ifile
@ %def ifile_append
@ Write the ifile contents to an external unit
<<Ifiles: public>>=
public :: ifile_write
<<Ifiles: subroutines>>=
subroutine ifile_write (ifile, unit, iostat)
type(ifile_t), intent(in) :: ifile
integer, intent(in), optional :: unit
integer, intent(out), optional :: iostat
integer :: u
type(line_entry_t), pointer :: current
u = given_output_unit (unit); if (u < 0) return
current => ifile%first
do while (associated (current))
call put_line (u, current%string, iostat)
current => current%next
end do
end subroutine ifile_write
@ %def ifile_write
@ Convert the ifile to an array of strings, which is allocated by this
function:
<<Ifiles: public>>=
public :: ifile_to_string_array
<<Ifiles: subroutines>>=
subroutine ifile_to_string_array (ifile, string)
type(ifile_t), intent(in) :: ifile
type(string_t), dimension(:), intent(inout), allocatable :: string
type(line_entry_t), pointer :: current
integer :: i
allocate (string (ifile_get_length (ifile)))
current => ifile%first
do i = 1, ifile_get_length (ifile)
string(i) = current%string
current => current%next
end do
end subroutine ifile_to_string_array
@ %def ifile_to_string_array
@
\subsection{Ifile tools}
<<Ifiles: public>>=
public :: ifile_get_length
<<Ifiles: subroutines>>=
function ifile_get_length (ifile) result (length)
integer :: length
type(ifile_t), intent(in) :: ifile
length = ifile%n_lines
end function ifile_get_length
@ %def ifile_get_length
@
\subsection{Line pointers}
Instead of the implicit pointer used in ordinary file access, we
define explicit pointers, so there can be more than one at a time.
<<Ifiles: public>>=
public :: line_p
<<Ifiles: types>>=
type :: line_p
private
type(line_entry_t), pointer :: p => null ()
end type line_p
@ %def line
@ Assign a file pointer to the first or last line in an ifile:
<<Ifiles: public>>=
public :: line_init
<<Ifiles: subroutines>>=
subroutine line_init (line, ifile, back)
type(line_p), intent(inout) :: line
type(ifile_t), intent(in) :: ifile
logical, intent(in), optional :: back
if (present (back)) then
if (back) then
line%p => ifile%last
else
line%p => ifile%first
end if
else
line%p => ifile%first
end if
end subroutine line_init
@ %def line_init
@ Remove the pointer association:
<<Ifiles: public>>=
public :: line_final
<<Ifiles: subroutines>>=
subroutine line_final (line)
type(line_p), intent(inout) :: line
nullify (line%p)
end subroutine line_final
@ %def line_final
@ Go one step forward
<<Ifiles: public>>=
public :: line_advance
<<Ifiles: subroutines>>=
subroutine line_advance (line)
type(line_p), intent(inout) :: line
if (associated (line%p)) line%p => line%p%next
end subroutine line_advance
@ %def line_advance
@ Go one step backward
<<Ifiles: public>>=
public :: line_backspace
<<Ifiles: subroutines>>=
subroutine line_backspace (line)
type(line_p), intent(inout) :: line
if (associated (line%p)) line%p => line%p%previous
end subroutine line_backspace
@ %def line_backspace
@ Check whether we are accessing a valid line
<<Ifiles: public>>=
public :: line_is_associated
<<Ifiles: subroutines>>=
function line_is_associated (line) result (ok)
logical :: ok
type(line_p), intent(in) :: line
ok = associated (line%p)
end function line_is_associated
@ %def line_is_associated
@
\subsection{Access lines via pointers}
We do not need the ifile as an argument to these functions, because
the [[line]] type will point to an existing ifile.
<<Ifiles: public>>=
public :: line_get_string
<<Ifiles: subroutines>>=
function line_get_string (line) result (string)
type(string_t) :: string
type(line_p), intent(in) :: line
if (associated (line%p)) then
string = line%p%string
else
string = ""
end if
end function line_get_string
@ %def line_get_string
@ Variant where the line pointer is advanced after reading.
<<Ifiles: public>>=
public :: line_get_string_advance
<<Ifiles: subroutines>>=
function line_get_string_advance (line) result (string)
type(string_t) :: string
type(line_p), intent(inout) :: line
if (associated (line%p)) then
string = line%p%string
call line_advance (line)
else
string = ""
end if
end function line_get_string_advance
@ %def line_get_string_advance
<<Ifiles: public>>=
public :: line_get_index
<<Ifiles: subroutines>>=
function line_get_index (line) result (index)
integer :: index
type(line_p), intent(in) :: line
if (associated (line%p)) then
index = line%p%index
else
index = 0
end if
end function line_get_index
@ %def line_get_index
<<Ifiles: public>>=
public :: line_get_length
<<Ifiles: subroutines>>=
function line_get_length (line) result (length)
integer :: length
type(line_p), intent(in) :: line
if (associated (line%p)) then
length = len (line%p%string)
else
length = 0
end if
end function line_get_length
@ %def line_get_length
@
\clearpage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Lexer}
The lexer purpose is to read from a line-separated character input
stream (usually a file) and properly chop the stream into lexemes
(tokens). [The parser will transform lexemes into meaningful tokens,
to be stored in a parse tree, therefore we do not use the term 'token'
here.] The input is read line-by-line, but interpreted free-form,
except for quotes and the comment syntax. (Fortran 2003 would allow
us to use a stream type for reading.)
In an object-oriented approach, we can dynamically create and destroy
lexers, including the lexer setup.
The main lexer function is to return a lexeme according to the basic
lexer rules (quotes, comments, whitespace, special classes). There is
also a routine to write back a lexeme to the input stream (but only
once).
For the rules, we separate the possible characters into classes.
Whitespace usually consists of blank, tab, and line-feed, where any
number of consecutive whitespace is equivalent to one. Quoted strings
are enclosed by a pair of quote characters, possibly multiline.
Comments are similar to quotes, but interpreted as whitespace.
Numbers are identified (not distinguishing real and integer) but not
interpreted. Other character classes make up identifiers.
<<[[lexers.f90]]>>=
<<File header>>
module lexers
<<Use strings>>
use io_units
use string_utils
use system_defs, only: EOF, EOR
use system_defs, only: LF
use system_defs, only: WHITESPACE_CHARS, LCLETTERS, UCLETTERS, DIGITS
use diagnostics
use ifiles, only: ifile_t
use ifiles, only: line_p, line_is_associated, line_init, line_final
use ifiles, only: line_get_string_advance
<<Standard module head>>
<<Lexer: public>>
<<Lexer: parameters>>
<<Lexer: types>>
<<Lexer: interfaces>>
contains
<<Lexer: procedures>>
end module lexers
@ %def lexers
@
\subsection{Input streams}
For flexible input, we define a generic stream type that refers to
either an external file, an external unit which is already open, a
string, an [[ifile]] object (internal file, i.e., string list), or a
line pointer to an [[ifile]] object. The stream type actually follows
the idea of a formatted external file, which is line-oriented. Thus,
the stream reader always returns a whole record (input line).
Note that only in the string version, the stream contents are stored
inside the stream object. In the [[ifile]] version, the stream
contains only the line pointer, while in the external-file case, the
line pointer is implicitly created by the runtime library.
<<Lexer: public>>=
public :: stream_t
<<Lexer: types>>=
type :: stream_t
type(string_t), pointer :: filename => null ()
integer, pointer :: unit => null ()
type(string_t), pointer :: string => null ()
type(ifile_t), pointer :: ifile => null ()
type(line_p), pointer :: line => null ()
integer :: record = 0
logical :: eof = .false.
contains
<<Lexer: stream: TBP>>
end type stream_t
@ %def stream_t
@ The initializers refer to the specific version. The stream should
be undefined before calling this.
<<Lexer: public>>=
public :: stream_init
<<Lexer: stream: TBP>>=
generic :: init => &
stream_init_filename, &
stream_init_unit, &
stream_init_string, &
stream_init_ifile, &
stream_init_line
procedure, private :: stream_init_filename
procedure, private :: stream_init_unit
procedure, private :: stream_init_string
procedure, private :: stream_init_ifile
procedure, private :: stream_init_line
<<Lexer: interfaces>>=
interface stream_init
module procedure stream_init_filename
module procedure stream_init_unit
module procedure stream_init_string
module procedure stream_init_ifile
module procedure stream_init_line
end interface
<<Lexer: procedures>>=
subroutine stream_init_filename (stream, filename)
class(stream_t), intent(out) :: stream
character(*), intent(in) :: filename
integer :: unit
unit = free_unit ()
open (unit=unit, file=filename, status="old", action="read")
call stream_init_unit (stream, unit)
allocate (stream%filename)
stream%filename = filename
end subroutine stream_init_filename
subroutine stream_init_unit (stream, unit)
class(stream_t), intent(out) :: stream
integer, intent(in) :: unit
allocate (stream%unit)
stream%unit = unit
stream%eof = .false.
end subroutine stream_init_unit
subroutine stream_init_string (stream, string)
class(stream_t), intent(out) :: stream
type(string_t), intent(in) :: string
allocate (stream%string)
stream%string = string
end subroutine stream_init_string
subroutine stream_init_ifile (stream, ifile)
class(stream_t), intent(out) :: stream
type(ifile_t), intent(in) :: ifile
type(line_p) :: line
call line_init (line, ifile)
call stream_init_line (stream, line)
allocate (stream%ifile)
stream%ifile = ifile
end subroutine stream_init_ifile
subroutine stream_init_line (stream, line)
class(stream_t), intent(out) :: stream
type(line_p), intent(in) :: line
allocate (stream%line)
stream%line = line
end subroutine stream_init_line
@ %def stream_init
@ The finalizer restores the initial state. If an external file was
opened, it is closed.
<<Lexer: public>>=
public :: stream_final
<<Lexer: stream: TBP>>=
procedure :: final => stream_final
<<Lexer: procedures>>=
subroutine stream_final (stream)
class(stream_t), intent(inout) :: stream
if (associated (stream%filename)) then
close (stream%unit)
deallocate (stream%unit)
deallocate (stream%filename)
else if (associated (stream%unit)) then
deallocate (stream%unit)
else if (associated (stream%string)) then
deallocate (stream%string)
else if (associated (stream%ifile)) then
call line_final (stream%line)
deallocate (stream%line)
deallocate (stream%ifile)
else if (associated (stream%line)) then
call line_final (stream%line)
deallocate (stream%line)
end if
end subroutine stream_final
@ %def stream_final
@ This returns the next record from the input stream. Depending on
the stream type, the stream pointers are modified: Reading from
external unit, the external file is advanced (implicitly). Reading
from string, the string is replaced by an empty string. Reading from
[[ifile]], the line pointer is advanced. Note that the [[iostat]]
argument is mandatory.
<<Lexer: public>>=
public :: stream_get_record
<<Lexer: procedures>>=
subroutine stream_get_record (stream, string, iostat)
type(stream_t), intent(inout) :: stream
type(string_t), intent(out) :: string
integer, intent(out) :: iostat
if (associated (stream%unit)) then
if (stream%eof) then
iostat = EOF
else
call get (stream%unit, string, iostat=iostat)
if (iostat == EOR) then
iostat = 0
stream%record = stream%record + 1
end if
if (iostat == EOF) then
iostat = 0
stream%eof = .true.
if (len (string) /= 0) stream%record = stream%record + 1
end if
end if
else if (associated (stream%string)) then
if (len (stream%string) /= 0) then
string = stream%string
stream%string = ""
iostat = 0
stream%record = stream%record + 1
else
string = ""
iostat = EOF
end if
else if (associated (stream%line)) then
if (line_is_associated (stream%line)) then
string = line_get_string_advance (stream%line)
iostat = 0
stream%record = stream%record + 1
else
string = ""
iostat = EOF
end if
else
call msg_bug (" Attempt to read from uninitialized input stream")
end if
end subroutine stream_get_record
@ %def stream_get_record
@ Return the current stream source as a message string.
<<Lexer: public>>=
public :: stream_get_source_info_string
<<Lexer: procedures>>=
function stream_get_source_info_string (stream) result (string)
type(string_t) :: string
type(stream_t), intent(in) :: stream
character(20) :: buffer
if (associated (stream%filename)) then
string = "File '" // stream%filename // "' (unit = "
write (buffer, "(I0)") stream%unit
string = string // trim (buffer) // ")"
else if (associated (stream%unit)) then
write (buffer, "(I0)") stream%unit
string = "Unit " // trim (buffer)
else if (associated (stream%string)) then
string = "Input string"
else if (associated (stream%ifile) .or. associated (stream%line)) then
string = "Internal file"
else
string = ""
end if
end function stream_get_source_info_string
@ %def stream_get_source_info_string
@ Return the index of the record just read as a message string.
<<Lexer: public>>=
public :: stream_get_record_info_string
<<Lexer: procedures>>=
function stream_get_record_info_string (stream) result (string)
type(string_t) :: string
type(stream_t), intent(in) :: stream
character(20) :: buffer
string = stream_get_source_info_string (stream)
if (string /= "") string = string // ", "
write (buffer, "(I0)") stream%record
string = string // "line " // trim (buffer)
end function stream_get_record_info_string
@ %def stream_get_record_info_string
@
\subsection{Keyword list}
The lexer should be capable of identifying a token as a known
keyword. To this end, we store a list of keywords:
<<Lexer: public>>=
public :: keyword_list_t
<<Lexer: types>>=
type :: keyword_entry_t
private
type(string_t) :: string
type(keyword_entry_t), pointer :: next => null ()
end type keyword_entry_t
type :: keyword_list_t
private
type(keyword_entry_t), pointer :: first => null ()
type(keyword_entry_t), pointer :: last => null ()
end type keyword_list_t
@ %def keyword_entry_t keyword_list_t
@ Add a new string to the keyword list, unless it is already there:
<<Lexer: public>>=
public :: keyword_list_add
<<Lexer: procedures>>=
subroutine keyword_list_add (keylist, string)
type(keyword_list_t), intent(inout) :: keylist
type(string_t), intent(in) :: string
type(keyword_entry_t), pointer :: k_entry_new
if (.not. keyword_list_contains (keylist, string)) then
allocate (k_entry_new)
k_entry_new%string = string
if (associated (keylist%first)) then
keylist%last%next => k_entry_new
else
keylist%first => k_entry_new
end if
keylist%last => k_entry_new
end if
end subroutine keyword_list_add
@ %def keyword_list_add
@ Return true if a string is a keyword.
<<Lexer: public>>=
public :: keyword_list_contains
<<Lexer: procedures>>=
function keyword_list_contains (keylist, string) result (found)
type(keyword_list_t), intent(in) :: keylist
type(string_t), intent(in) :: string
logical :: found
found = .false.
call check_rec (keylist%first)
contains
recursive subroutine check_rec (k_entry)
type(keyword_entry_t), pointer :: k_entry
if (associated (k_entry)) then
if (k_entry%string /= string) then
call check_rec (k_entry%next)
else
found = .true.
end if
end if
end subroutine check_rec
end function keyword_list_contains
@ %def keyword_list_contains
@ Write the keyword list
<<Lexer: public>>=
public :: keyword_list_write
<<Lexer: interfaces>>=
interface keyword_list_write
module procedure keyword_list_write_unit
end interface
<<Lexer: procedures>>=
subroutine keyword_list_write_unit (keylist, unit)
type(keyword_list_t), intent(in) :: keylist
integer, intent(in) :: unit
write (unit, "(A)") "Keyword list:"
if (associated (keylist%first)) then
call keyword_write_rec (keylist%first)
write (unit, *)
else
write (unit, "(1x,A)") "[empty]"
end if
contains
recursive subroutine keyword_write_rec (k_entry)
type(keyword_entry_t), intent(in), pointer :: k_entry
if (associated (k_entry)) then
write (unit, "(1x,A)", advance="no") char (k_entry%string)
call keyword_write_rec (k_entry%next)
end if
end subroutine keyword_write_rec
end subroutine keyword_list_write_unit
@ %def keyword_list_write
@ Clear the keyword list
<<Lexer: public>>=
public :: keyword_list_final
<<Lexer: procedures>>=
subroutine keyword_list_final (keylist)
type(keyword_list_t), intent(inout) :: keylist
call keyword_destroy_rec (keylist%first)
nullify (keylist%last)
contains
recursive subroutine keyword_destroy_rec (k_entry)
type(keyword_entry_t), pointer :: k_entry
if (associated (k_entry)) then
call keyword_destroy_rec (k_entry%next)
deallocate (k_entry)
end if
end subroutine keyword_destroy_rec
end subroutine keyword_list_final
@ %def keyword_list_final
@
\subsection{Lexeme templates}
This type is handled like a rudimentary regular expression. It
determines the lexer behavior when matching a string. The actual
objects made from this type and the corresponding matching routines
are listed below.
<<Lexer: types>>=
type :: template_t
private
integer :: type
character(256) :: charset1, charset2
integer :: len1, len2
end type template_t
@ %def template_t
@ These are the types that valid lexemes can have:
<<Lexer: public>>=
public :: T_KEYWORD, T_IDENTIFIER, T_QUOTED, T_NUMERIC
<<Lexer: parameters>>=
integer, parameter :: T_KEYWORD = 1
integer, parameter :: T_IDENTIFIER = 2, T_QUOTED = 3, T_NUMERIC = 4
@ %def T_KEYWORD T_IDENTIFIER T_QUOTED T_NUMERIC
@ These are special types:
<<Lexer: parameters>>=
integer, parameter :: EMPTY = 0, WHITESPACE = 10
integer, parameter :: NO_MATCH = 11, IO_ERROR = 12, OVERFLOW = 13
integer, parameter :: UNMATCHED_QUOTE = 14
@ %def EMPTY WHITESPACE NO_MATCH IO_ERROR OVERFLOW UNMATCHED_QUOTE
@ In addition, we have [[EOF]] which is a negative integer, normally $-1$.
@ Printout for debugging:
<<Lexer: procedures>>=
subroutine lexeme_type_write (type, unit)
integer, intent(in) :: type
integer, intent(in) :: unit
select case (type)
case (EMPTY); write(unit,"(A)",advance="no") " EMPTY "
case (WHITESPACE); write(unit,"(A)",advance="no") " WHITESPACE "
case (T_IDENTIFIER);write(unit,"(A)",advance="no") " IDENTIFIER "
case (T_QUOTED); write(unit,"(A)",advance="no") " QUOTED "
case (T_NUMERIC); write(unit,"(A)",advance="no") " NUMERIC "
case (IO_ERROR); write(unit,"(A)",advance="no") " IO_ERROR "
case (OVERFLOW); write(unit,"(A)",advance="no") " OVERFLOW "
case (UNMATCHED_QUOTE); write(unit,"(A)",advance="no") " UNMATCHEDQ "
case (NO_MATCH); write(unit,"(A)",advance="no") " NO_MATCH "
case (EOF); write(unit,"(A)",advance="no") " EOF "
case default; write(unit,"(A)",advance="no") " [illegal] "
end select
end subroutine lexeme_type_write
subroutine template_write (tt, unit)
type(template_t), intent(in) :: tt
integer, intent(in) :: unit
call lexeme_type_write (tt%type, unit)
write (unit, "(A)", advance="no") "'" // tt%charset1(1:tt%len1) // "'"
write (unit, "(A)", advance="no") " '" // tt%charset2(1:tt%len2) // "'"
end subroutine template_write
@ %def template_write
@ The matching functions all return the number of matched characters in
the provided string. If this number is zero, the match has failed.
The [[template]] functions are declared [[pure]] because they appear
in [[forall]] loops below.
A template for whitespace:
<<Lexer: procedures>>=
pure function template_whitespace (chars) result (tt)
character(*), intent(in) :: chars
type(template_t) :: tt
tt = template_t (WHITESPACE, chars, "", len (chars), 0)
end function template_whitespace
@ %def template_whitespace
@ Just match the string against the character set.
<<Lexer: procedures>>=
subroutine match_whitespace (tt, s, n)
type(template_t), intent(in) :: tt
character(*), intent(in) :: s
integer, intent(out) :: n
n = verify (s, tt%charset1(1:tt%len1)) - 1
if (n < 0) n = len (s)
end subroutine match_whitespace
@ %def match_whitespace
@ A template for normal identifiers. To match, a lexeme should have a
first character in class [[chars1]] and an arbitrary number of further
characters in class [[chars2]]. If the latter is empty, we are
looking for a single-character lexeme.
<<Lexer: procedures>>=
pure function template_identifier (chars1, chars2) result (tt)
character(*), intent(in) :: chars1, chars2
type(template_t) :: tt
tt = template_t (T_IDENTIFIER, chars1, chars2, len(chars1), len(chars2))
end function template_identifier
@ %def template_identifier
@ Here, the first letter must match, the others may or may not.
<<Lexer: procedures>>=
subroutine match_identifier (tt, s, n)
type(template_t), intent(in) :: tt
character(*), intent(in) :: s
integer, intent(out) :: n
if (verify (s(1:1), tt%charset1(1:tt%len1)) == 0) then
n = verify (s(2:), tt%charset2(1:tt%len2))
if (n == 0) n = len (s)
else
n = 0
end if
end subroutine match_identifier
@ %def match_identifier
@ A template for quoted strings. The same template applies for
comments. The first character set indicates the
left quote (could be a sequence of several characters), the second one
the matching right quote.
<<Lexer: procedures>>=
pure function template_quoted (chars1, chars2) result (tt)
character(*), intent(in) :: chars1, chars2
type(template_t) :: tt
tt = template_t (T_QUOTED, chars1, chars2, len (chars1), len (chars2))
end function template_quoted
@ %def template_quoted
@ Here, the beginning of the string must exactly match the first
character set, then we look for the second one. If found, return.
If there is a first quote but no second one, return a negative number,
indicating this error condition.
<<Lexer: procedures>>=
subroutine match_quoted (tt, s, n, range)
type(template_t), intent(in) :: tt
character(*), intent(in) :: s
integer, intent(out) :: n
integer, dimension(2), intent(out) :: range
character(tt%len1) :: ch1
character(tt%len2) :: ch2
integer :: i
ch1 = tt%charset1
if (s(1:tt%len1) == ch1) then
ch2 = tt%charset2
do i = tt%len1 + 1, len (s) - tt%len2 + 1
if (s(i:i+tt%len2-1) == ch2) then
n = i + tt%len2 - 1
range(1) = tt%len1 + 1
range(2) = i - 1
return
end if
end do
n = -1
range = 0
else
n = 0
range = 0
end if
end subroutine match_quoted
@ %def match_quoted
@ A template for real numbers. The first character set is the set of
allowed exponent letters. In accordance with the other functions we
return the lexeme as a string but do not read it.
<<Lexer: procedures>>=
pure function template_numeric (chars) result (tt)
character(*), intent(in) :: chars
type(template_t) :: tt
tt = template_t (T_NUMERIC, chars, "", len (chars), 0)
end function template_numeric
@ %def template_numeric
@ A numeric lexeme may be real or integer. We purposely do not allow
for a preceding sign. If the number is followed by an exponent, this
is included, otherwise the rest is ignored.
There is a possible pitfall with this behavior: while the string
[[1e3]] will be interpreted as a single number, the analogous string
[[1a3]] will be split into the number [[1]] and an identifier [[a3]].
There is no easy way around such an ambiguity. We should make sure
that the syntax does not contain identifiers like [[a3]] or [[e3]].
<<Lexer: procedures>>=
subroutine match_numeric (tt, s, n)
type(template_t), intent(in) :: tt
character(*), intent(in) :: s
integer, intent(out) :: n
integer :: i, n0
character(10), parameter :: digits = "0123456789"
character(2), parameter :: signs = "-+"
n = verify (s, digits) - 1
if (n < 0) then
n = 0
return
else if (s(n+1:n+1) == ".") then
i = verify (s(n+2:), digits) - 1
if (i < 0) then
n = len (s)
return
else if (i > 0 .or. n > 0) then
n = n + 1 + i
end if
end if
n0 = n
if (n > 0) then
if (verify (s(n+1:n+1), tt%charset1(1:tt%len1)) == 0) then
n = n + 1
if (verify (s(n+1:n+1), signs) == 0) n = n + 1
i = verify (s(n+1:), digits) - 1
if (i < 0) then
n = len (s)
else if (i == 0) then
n = n0
else
n = n + i
end if
end if
end if
end subroutine match_numeric
@ %def match_numeric
@ The generic matching routine. With Fortran 2003 we would define
separate types and use a SELECT TYPE instead.
<<Lexer: procedures>>=
subroutine match_template (tt, s, n, range)
type(template_t), intent(in) :: tt
character(*), intent(in) :: s
integer, intent(out) :: n
integer, dimension(2), intent(out) :: range
select case (tt%type)
case (WHITESPACE)
call match_whitespace (tt, s, n)
range = 0
case (T_IDENTIFIER)
call match_identifier (tt, s, n)
range(1) = 1
range(2) = len_trim (s)
case (T_QUOTED)
call match_quoted (tt, s, n, range)
case (T_NUMERIC)
call match_numeric (tt, s, n)
range(1) = 1
range(2) = len_trim (s)
case default
call msg_bug ("Invalid lexeme template encountered")
end select
end subroutine match_template
@ %def match_template
@ Match against an array of templates. Return the index of the first
template that matches together with the number of characters matched
and the range of the relevant substring. If all fails, these numbers
are zero.
<<Lexer: procedures>>=
subroutine match (tt, s, n, range, ii)
type(template_t), dimension(:), intent(in) :: tt
character(*), intent(in) :: s
integer, intent(out) :: n
integer, dimension(2), intent(out) :: range
integer, intent(out) :: ii
integer :: i
do i = 1, size (tt)
call match_template (tt(i), s, n, range)
if (n /= 0) then
ii = i
return
end if
end do
n = 0
ii = 0
end subroutine match
@ %def match
@
\subsection{The lexer setup}
This object contains information about character classes. As said
above, one class consists of quoting chars (matching left and right),
another one of comment chars (similar), a class of whitespace, and
several classes of characters that make up identifiers. When creating
the lexer setup, the character classes are transformed into lexeme
templates which are to be matched in a certain predefined order
against the input stream.
BLANK should always be taken as whitespace, some things may depend on
this. TAB is also fixed by convention, but may in principle be
modified. Newline (DOS!) and linefeed are also defined as whitespace.
@
The lexer setup, containing the list of lexeme templates. No
defaults yet. The type with index zero will be assigned to the
[[NO_MATCH]] lexeme.
The keyword list is not stored, just a pointer to it. We anticipate
that the keyword list is part of the syntax table, and the lexer needs
not alter it. Furthermore, the lexer is typically finished before the
syntax table is.
<<Lexer: parameters>>=
integer, parameter :: CASE_KEEP = 0, CASE_UP = 1, CASE_DOWN = 2
@ %def CASE_KEEP CASE_UP CASE_DOWN
<<Lexer: types>>=
type :: lexer_setup_t
private
type(template_t), dimension(:), allocatable :: tt
integer, dimension(:), allocatable :: type
integer :: keyword_case = CASE_KEEP
type(keyword_list_t), pointer :: keyword_list => null ()
end type lexer_setup_t
@ %def lexer_setup
@ Fill the lexer setup object. Some things are hardcoded here
(whitespace, alphanumeric identifiers), some are free: comment chars
(but these must be single, and comments must be terminated by
line-feed), quote chars and matches (must be single), characters to be
read as one-character lexeme, special classes (characters of one class
that should be glued together as identifiers).
<<Lexer: procedures>>=
subroutine lexer_setup_init (setup, &
comment_chars, quote_chars, quote_match, &
single_chars, special_class, &
keyword_list, upper_case_keywords)
type(lexer_setup_t), intent(inout) :: setup
character(*), intent(in) :: comment_chars
character(*), intent(in) :: quote_chars, quote_match
character(*), intent(in) :: single_chars
character(*), dimension(:), intent(in) :: special_class
type(keyword_list_t), pointer :: keyword_list
logical, intent(in), optional :: upper_case_keywords
integer :: n, i
if (present (upper_case_keywords)) then
if (upper_case_keywords) then
setup%keyword_case = CASE_UP
else
setup%keyword_case = CASE_DOWN
end if
else
setup%keyword_case = CASE_KEEP
end if
n = 1 + len (comment_chars) + len (quote_chars) + 1 &
+ len (single_chars) + size (special_class) + 1
allocate (setup%tt(n))
allocate (setup%type(0:n))
n = 0
setup%type(n) = NO_MATCH
n = n + 1
setup%tt(n) = template_whitespace (WHITESPACE_CHARS)
setup%type(n) = EMPTY
forall (i = 1:len(comment_chars))
setup%tt(n+i) = template_quoted (comment_chars(i:i), LF)
setup%type(n+i) = EMPTY
end forall
n = n + len (comment_chars)
forall (i = 1:len(quote_chars))
setup%tt(n+i) = template_quoted (quote_chars(i:i), quote_match(i:i))
setup%type(n+i) = T_QUOTED
end forall
n = n + len (quote_chars)
setup%tt(n+1) = template_numeric ("EeDd")
setup%type(n+1) = T_NUMERIC
n = n + 1
forall (i = 1:len (single_chars))
setup%tt(n+i) = template_identifier (single_chars(i:i), "")
setup%type(n+i) = T_IDENTIFIER
end forall
n = n + len (single_chars)
forall (i = 1:size (special_class))
setup%tt(n+i) = template_identifier &
(trim (special_class(i)), trim (special_class(i)))
setup%type(n+i) = T_IDENTIFIER
end forall
n = n + size (special_class)
setup%tt(n+1) = template_identifier &
(LCLETTERS//UCLETTERS, LCLETTERS//DIGITS//"_"//UCLETTERS)
setup%type(n+1) = T_IDENTIFIER
n = n + 1
if (n /= size (setup%tt)) &
call msg_bug ("Size mismatch in lexer setup")
setup%keyword_list => keyword_list
end subroutine lexer_setup_init
@ %def lexer_setup_init
@ The destructor is needed only if the object is not itself part of an
allocatable array
<<Lexer: procedures>>=
subroutine lexer_setup_final (setup)
type(lexer_setup_t), intent(inout) :: setup
deallocate (setup%tt, setup%type)
setup%keyword_list => null ()
end subroutine lexer_setup_final
@ %def lexer_setup_final
@ For debugging: Write the lexer setup
<<Lexer: procedures>>=
subroutine lexer_setup_write (setup, unit)
type(lexer_setup_t), intent(in) :: setup
integer, intent(in) :: unit
integer :: i
write (unit, "(A)") "Lexer setup:"
if (allocated (setup%tt)) then
do i = 1, size (setup%tt)
call template_write (setup%tt(i), unit)
write (unit, '(A)', advance = "no") " -> "
call lexeme_type_write (setup%type(i), unit)
write (unit, *)
end do
else
write (unit, *) "[empty]"
end if
if (associated (setup%keyword_list)) then
call keyword_list_write (setup%keyword_list, unit)
end if
end subroutine lexer_setup_write
@ %def lexer_setup_write
@
\subsection{The lexeme type}
An object of this type is returned by the lexer. Apart from the lexeme
string, it gives information about the relevant substring (first and
last character index) and the lexeme type. Interpreting the string is
up to the parser.
<<Lexer: public>>=
public :: lexeme_t
<<Lexer: types>>=
type :: lexeme_t
private
integer :: type = EMPTY
type(string_t) :: s
integer :: b = 0, e = 0
end type lexeme_t
@ %def lexeme_t
@ Debugging aid:
<<Lexer: public>>=
public :: lexeme_write
<<Lexer: procedures>>=
subroutine lexeme_write (t, unit)
type(lexeme_t), intent(in) :: t
integer, intent(in) :: unit
integer :: u
u = given_output_unit (unit); if (u < 0) return
select case (t%type)
case (T_KEYWORD)
write (u, *) "KEYWORD: '" // char (t%s) // "'"
case (T_IDENTIFIER)
write (u, *) "IDENTIFIER: '" // char (t%s) // "'"
case (T_QUOTED)
write (u, *) "QUOTED: '" // char (t%s) // "'"
case (T_NUMERIC)
write (u, *) "NUMERIC: '" // char (t%s) // "'"
case (UNMATCHED_QUOTE)
write (u, *) "Unmatched quote: "// char (t%s)
case (OVERFLOW); write (u, *) "Overflow: "// char (t%s)
case (EMPTY); write (u, *) "Empty lexeme"
case (NO_MATCH); write (u, *) "No match"
case (IO_ERROR); write (u, *) "IO error"
case (EOF); write (u, *) "EOF"
case default
write (u, *) "Error"
end select
end subroutine lexeme_write
@ %def lexeme_write
@ Store string and type in a lexeme. The range determines the
beginning and end of the relevant part of the string. Check for a
keyword.
<<Lexer: procedures>>=
subroutine lexeme_set (t, keyword_list, s, range, type, keyword_case)
type(lexeme_t), intent(out) :: t
type(keyword_list_t), pointer :: keyword_list
type(string_t), intent(in) :: s
type(string_t) :: keyword
integer, dimension(2), intent(in) :: range
integer, intent(in) :: type
integer, intent(in), optional :: keyword_case
t%type = type
if (present (keyword_case)) then
select case (keyword_case)
case (CASE_KEEP); keyword = s
case (CASE_UP); keyword = upper_case (s)
case (CASE_DOWN); keyword = lower_case (s)
end select
else
keyword = s
end if
if (type == T_IDENTIFIER) then
if (associated (keyword_list)) then
if (keyword_list_contains (keyword_list, keyword)) &
t%type = T_KEYWORD
end if
end if
select case (t%type)
case (T_KEYWORD); t%s = keyword
case default; t%s = s
end select
t%b = range(1)
t%e = range(2)
end subroutine lexeme_set
subroutine lexeme_clear (t)
type(lexeme_t), intent(out) :: t
t%type = EMPTY
t%s = ""
end subroutine lexeme_clear
@ %def lexeme_set lexeme_clear
@ Retrieve the lexeme string, the relevant part of it, and the type.
The last function returns true if there is a break condition reached
(error or EOF).
<<Lexer: public>>=
public :: lexeme_get_string
public :: lexeme_get_contents
public :: lexeme_get_delimiters
public :: lexeme_get_type
<<Lexer: procedures>>=
function lexeme_get_string (t) result (s)
type(string_t) :: s
type(lexeme_t), intent(in) :: t
s = t%s
end function lexeme_get_string
function lexeme_get_contents (t) result (s)
type(string_t) :: s
type(lexeme_t), intent(in) :: t
s = extract (t%s, t%b, t%e)
end function lexeme_get_contents
function lexeme_get_delimiters (t) result (del)
type(string_t), dimension(2) :: del
type(lexeme_t), intent(in) :: t
del(1) = extract (t%s, finish = t%b-1)
del(2) = extract (t%s, start = t%e+1)
end function lexeme_get_delimiters
function lexeme_get_type (t) result (type)
integer :: type
type(lexeme_t), intent(in) :: t
type = t%type
end function lexeme_get_type
@ %def lexeme_get_string lexeme_get_contents lexeme_get_type
@ Check for a generic break condition (error/eof) and for eof in
particular.
<<Lexer: public>>=
public :: lexeme_is_break
public :: lexeme_is_eof
<<Lexer: procedures>>=
function lexeme_is_break (t) result (break)
logical :: break
type(lexeme_t), intent(in) :: t
select case (t%type)
case (EOF, IO_ERROR, OVERFLOW, NO_MATCH)
break = .true.
case default
break = .false.
end select
end function lexeme_is_break
function lexeme_is_eof (t) result (ok)
logical :: ok
type(lexeme_t), intent(in) :: t
ok = t%type == EOF
end function lexeme_is_eof
@ %def lexeme_is_break lexeme_is_eof
@
\subsection{The lexer object}
We store the current lexeme and the current line. The line buffer is
set each time a new line is read from file. The working buffer has
one character more, to hold any trailing blank. Pointers to line and
column are for debugging, they will be used to make up readable error
messages for the parser.
<<Lexer: public>>=
public :: lexer_t
<<Lexer: types>>=
type :: lexer_t
private
type(lexer_setup_t) :: setup
type(stream_t), pointer :: stream => null ()
type(lexeme_t) :: lexeme
type(string_t) :: previous_line2
type(string_t) :: previous_line1
type(string_t) :: current_line
integer :: lines_read = 0
integer :: current_column = 0
integer :: previous_column = 0
type(string_t) :: buffer
type(lexer_t), pointer :: parent => null ()
contains
<<Lexer: lexer: TBP>>
end type lexer_t
@ %def lexer_t
@ Create-setup wrapper
<<Lexer: public>>=
public :: lexer_init
<<Lexer: lexer: TBP>>=
procedure :: init => lexer_init
<<Lexer: procedures>>=
subroutine lexer_init (lexer, &
comment_chars, quote_chars, quote_match, &
single_chars, special_class, &
keyword_list, upper_case_keywords, &
parent)
class(lexer_t), intent(inout) :: lexer
character(*), intent(in) :: comment_chars
character(*), intent(in) :: quote_chars, quote_match
character(*), intent(in) :: single_chars
character(*), dimension(:), intent(in) :: special_class
type(keyword_list_t), pointer :: keyword_list
logical, intent(in), optional :: upper_case_keywords
type(lexer_t), target, intent(in), optional :: parent
call lexer_setup_init (lexer%setup, &
comment_chars = comment_chars, &
quote_chars = quote_chars, &
quote_match = quote_match, &
single_chars = single_chars, &
special_class = special_class, &
keyword_list = keyword_list, &
upper_case_keywords = upper_case_keywords)
if (present (parent)) lexer%parent => parent
call lexer_clear (lexer)
end subroutine lexer_init
@ %def lexer_init
@ Clear the lexer state, but not the setup. This should be done when
the lexing starts, but it is not known whether the lexer was used
before.
<<Lexer: public>>=
public :: lexer_clear
<<Lexer: lexer: TBP>>=
procedure :: clear => lexer_clear
<<Lexer: procedures>>=
subroutine lexer_clear (lexer)
class(lexer_t), intent(inout) :: lexer
call lexeme_clear (lexer%lexeme)
lexer%previous_line2 = ""
lexer%previous_line1 = ""
lexer%current_line = ""
lexer%lines_read = 0
lexer%current_column = 0
lexer%previous_column = 0
lexer%buffer = ""
end subroutine lexer_clear
@ %def lexer_clear
@ Reset lexer state and delete setup
<<Lexer: public>>=
public :: lexer_final
<<Lexer: lexer: TBP>>=
procedure :: final => lexer_final
<<Lexer: procedures>>=
subroutine lexer_final (lexer)
class(lexer_t), intent(inout) :: lexer
call lexer%clear ()
call lexer_setup_final (lexer%setup)
end subroutine lexer_final
@ %def lexer_final
@
\subsection{The lexer routine}
For lexing we need to associate an input stream to the lexer.
<<Lexer: public>>=
public :: lexer_assign_stream
<<Lexer: lexer: TBP>>=
procedure :: assign_stream => lexer_assign_stream
<<Lexer: procedures>>=
subroutine lexer_assign_stream (lexer, stream)
class(lexer_t), intent(inout) :: lexer
type(stream_t), intent(in), target :: stream
lexer%stream => stream
end subroutine lexer_assign_stream
@ %def lexer_assign_stream
@
The lexer. The lexer function takes the lexer and returns the
currently stored lexeme. If there is none, it is read from buffer,
matching against the lexeme templates in the lexer setup. Empty
lexemes, i.e., comments and whitespace, are discarded and the buffer
is read again until we have found a nonempty lexeme (which may also be
EOF or an error condition).
The initial state of the lexer contains an empty lexeme, so reading
from buffer is forced. The empty state is restored after returning
the lexeme. A nonempty lexeme is present in the lexer only if
[[lex_back]] has been executed before.
The workspace is the [[lexer%buffer]], treated as a sort of input
stream. We chop off lexemes from the beginning, adjusting the buffer
to the left. Whenever the buffer is empty, or we are matching against
an open quote which has not terminated, we read a new line and append
it to the right. This may result in special conditions, which for
simplicity are also returned as lexemes: I/O error, buffer overflow,
end of file. If the latter happens during reading a quoted string,
we return an unmatched-quote lexeme. Obviously, the special-condition
lexemes have to be caught by the parser.
Note that reading further lines is only necessary when reading a
quoted string. Otherwise, the line-feed that ends each line is
interpreted as whitespace which terminates a preceding lexeme, so there
are no other valid multiline lexemes.
To enable meaningful error messages, we also keep track of the line
number of the last line read, and the beginning and the end of the
current lexeme with respect to this line.
The lexer is implemented as a function that returns the next lexeme
(i.e., token). It uses the [[lexer]] setup and modifies the buffers
and pointers stored within the lexer, a side effect. The lexer reads
from an input stream object, which also is modified by this reading,
e.g., a line pointer is advanced.
<<Lexer: public>>=
public :: lex
<<Lexer: procedures>>=
subroutine lex (lexeme, lexer)
type(lexeme_t), intent(out) :: lexeme
type(lexer_t), intent(inout) :: lexer
integer :: iostat1, iostat2
integer :: pos
integer, dimension(2) :: range
integer :: template_index, type
if (.not. associated (lexer%stream)) &
call msg_bug ("Lexer called without assigned stream")
GET_LEXEME: do while (lexeme_get_type (lexer%lexeme) == EMPTY)
if (len (lexer%buffer) /= 0) then
iostat1 = 0
else
call lexer_read_line (lexer, iostat1)
end if
select case (iostat1)
case (0)
MATCH_BUFFER: do
call match (lexer%setup%tt, char (lexer%buffer), &
pos, range, template_index)
if (pos >= 0) then
type = lexer%setup%type(template_index)
exit MATCH_BUFFER
else
pos = 0
call lexer_read_line (lexer, iostat2)
select case (iostat2)
case (EOF); type = UNMATCHED_QUOTE; exit MATCH_BUFFER
case (1); type = IO_ERROR; exit MATCH_BUFFER
case (2); type = OVERFLOW; exit MATCH_BUFFER
end select
end if
end do MATCH_BUFFER
case (EOF); type = EOF
case (1); type = IO_ERROR
case (2); type = OVERFLOW
end select
call lexeme_set (lexer%lexeme, lexer%setup%keyword_list, &
extract (lexer%buffer, finish=pos), range, type, &
lexer%setup%keyword_case)
lexer%buffer = remove (lexer%buffer, finish=pos)
lexer%previous_column = lexer%current_column
lexer%current_column = lexer%current_column + pos
end do GET_LEXEME
lexeme = lexer%lexeme
call lexeme_clear (lexer%lexeme)
end subroutine lex
@ %def lex
@ Read a line and append it to the input buffer. If the input buffer
overflows, return [[iostat=2]]. Otherwise, [[iostat=1]] indicates an
I/O error, and [[iostat=-1]] the EOF.
The input stream may either be an external unit or a [[ifile]]
object. In the latter case, a line is read and the line pointer is
advanced.
Note that inserting [[LF]] between input lines is the Unix convention.
Since we are doing this explicitly when gluing lines together, we can
pattern-match against [[LF]] without having to worry about the system.
<<Lexer: procedures>>=
subroutine lexer_read_line (lexer, iostat)
type(lexer_t), intent(inout) :: lexer
integer, intent(out) :: iostat
type(string_t) :: current_line
current_line = lexer%current_line
call stream_get_record (lexer%stream, lexer%current_line, iostat)
if (iostat == 0) then
lexer%lines_read = lexer%lines_read + 1
lexer%previous_line2 = lexer%previous_line1
lexer%previous_line1 = current_line
lexer%buffer = lexer%buffer // lexer%current_line // LF
lexer%previous_column = 0
lexer%current_column = 0
end if
end subroutine lexer_read_line
@ %def lexer_read_line
@ Once in a while we have read one lexeme to many, which can be
pushed back into the input stream. Do not do this more than once.
<<Lexer: public>>=
public :: lexer_put_back
<<Lexer: procedures>>=
subroutine lexer_put_back (lexer, lexeme)
type(lexer_t), intent(inout) :: lexer
type(lexeme_t), intent(in) :: lexeme
if (lexeme_get_type (lexer%lexeme) == EMPTY) then
lexer%lexeme = lexeme
else
call msg_bug (" Lexer: lex_back fails; probably called twice")
end if
end subroutine lexer_put_back
@ %def lexer_put__back
@
\subsection{Diagnostics}
For debugging: print just the setup
<<Lexer: public>>=
public :: lexer_write_setup
<<Lexer: procedures>>=
subroutine lexer_write_setup (lexer, unit)
type(lexer_t), intent(in) :: lexer
integer, intent(in), optional :: unit
integer :: u
u = given_output_unit (unit); if (u < 0) return
call lexer_setup_write (lexer%setup, u)
end subroutine lexer_write_setup
@ %def lexer_write_setup
@
This is useful for error printing: show the current line with index
and a pointer to the current column within the line.
<<Lexer: public>>=
public :: lexer_show_location
<<Lexer: procedures>>=
subroutine lexer_show_location (lexer)
type(lexer_t), intent(in) :: lexer
type(string_t) :: loc_str
if (associated (lexer%parent)) then
call lexer_show_source (lexer%parent)
call msg_message ("[includes]")
else
call msg_message ()
end if
if (associated (lexer%stream)) then
call msg_message &
(char (stream_get_record_info_string (lexer%stream)) // ":")
end if
if (lexer%lines_read >= 4) call msg_result ("[...]")
if (lexer%lines_read >= 3) call msg_result (char (lexer%previous_line2))
if (lexer%lines_read >= 2) call msg_result (char (lexer%previous_line1))
if (lexer%lines_read >= 1) then
call msg_result (char (lexer%current_line))
loc_str = repeat (" ", lexer%previous_column)
loc_str = loc_str // "^"
if (lexer%current_column > lexer%previous_column) then
loc_str = loc_str &
// repeat ("-", max (lexer%current_column &
- lexer%previous_column - 1, 0)) &
// "^"
end if
call msg_result (char (loc_str))
end if
end subroutine lexer_show_location
@ %def lexer_show_location
@ This just prints the current stream source.
<<Lexer: procedures>>=
recursive subroutine lexer_show_source (lexer)
type(lexer_t), intent(in) :: lexer
if (associated (lexer%parent)) then
call lexer_show_source (lexer%parent)
call msg_message ("[includes]")
else
call msg_message ()
end if
if (associated (lexer%stream)) then
call msg_message &
(char (stream_get_source_info_string (lexer%stream)) // ":")
end if
end subroutine lexer_show_source
@ %def lexer_show_source
@
\subsection{Unit tests}
Test module, followed by the corresponding implementation module.
<<[[lexers_ut.f90]]>>=
<<File header>>
module lexers_ut
use unit_tests
use lexers_uti
<<Standard module head>>
<<Lexer: public test>>
contains
<<Lexer: test driver>>
end module lexers_ut
@ %def lexers_ut
@
<<[[lexers_uti.f90]]>>=
<<File header>>
module lexers_uti
<<Use strings>>
use lexers
<<Standard module head>>
<<Lexer: test declarations>>
contains
<<Lexer: tests>>
end module lexers_uti
@ %def lexers_ut
@ API: driver for the unit tests below.
<<Lexer: public test>>=
public :: lexer_test
<<Lexer: test driver>>=
subroutine lexer_test (u, results)
integer, intent(in) :: u
type(test_results_t), intent(inout) :: results
<<Lexer: execute tests>>
end subroutine lexer_test
@ %def lexer_test
@ Test the lexer by lexing and printing all lexemes from unit [[u]],
one per line, using preset conventions.
<<Lexer: execute tests>>=
call test (lexer_1, "lexer_1", &
"check lexer", u, results)
<<Lexer: test declarations>>=
public :: lexer_1
<<Lexer: tests>>=
subroutine lexer_1 (u)
integer, intent(in) :: u
type(lexer_t), target :: lexer
type(stream_t), target :: stream
type(string_t) :: string
type(lexeme_t) :: lexeme
string = "abcdefghij"
call lexer_init (lexer, &
comment_chars = "", &
quote_chars = "<'""", &
quote_match = ">'""", &
single_chars = "?*+|=,()", &
special_class = ["."], &
keyword_list = null ())
call stream_init (stream, string)
call lexer_assign_stream (lexer, stream)
do
call lex (lexeme, lexer)
call lexeme_write (lexeme, u)
if (lexeme_is_break (lexeme)) exit
end do
call stream_final (stream)
call lexer_final (lexer)
end subroutine lexer_1
@ %def lexer_1
@
\clearpage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Syntax rules}
This module provides tools to handle syntax rules in an abstract way.
<<[[syntax_rules.f90]]>>=
<<File header>>
module syntax_rules
<<Use strings>>
use io_units
use diagnostics
use system_defs, only: LCLETTERS, UCLETTERS, DIGITS
use ifiles, only: line_p, line_init, line_get_string_advance, line_final
use ifiles, only: ifile_t, ifile_get_length
use lexers
<<Standard module head>>
<<Syntax: public>>
<<Syntax: parameters>>
<<Syntax: types>>
<<Syntax: interfaces>>
contains
<<Syntax: subroutines>>
end module syntax_rules
@ %def syntax_rules
@
\subsection{Syntax rules}
Syntax rules are used by the parser. They determine how to translate
the stream of lexemes as returned by the lexer into the parse tree
node. A rule may be terminal, i.e., replace a matching lexeme into a
terminal node. The node will contain the lexeme interpreted as a
recognized token:
\begin{itemize}
\item a keyword: unquoted fixed character string;
\item a real number, to be determined at runtime;
\item an integer, to be determined at runtime;
\item a boolean value, to be determined at runtime;
\item a quoted token (e.g., string), to be determined at runtime;
\item an identifier (unquoted string that is not a recognized
keyword), to be determined at runtime.
\end{itemize}
It may be nonterminal, i.e., contain a sequence of child rules.
These are matched consecutively (and recursively) against the input
stream; the resulting node will be a branch node.
\begin{itemize}
\item the file, i.e., the input stream as a whole;
\item a sequence of syntax elements, where the last syntax element may be
optional, or optional repetitive;
\end{itemize}
Sequences carry a flag that tells whether the last child is optional
or may be repeated an arbitrary number of times, correponding to the
regexp modifiers [[?]], [[*]], and [[+]].
We also need an alternative rule; this will be replaced by the node
generated by one of its children that matches; thus, it does not
create a node of its own.
\begin{itemize}
\item an alternative of syntax elements.
\end{itemize}
We also define special types of sequences as convenience macros:
\begin{itemize}
\item a list: a sequence where the elements are separated by a
separator keyword (e.g., commas), the separators are thrown away
when parsing the list;
\item a group: a sequence of three tokens, where the first and third
ones are left and right delimiters, the delimiters are thrown away;
\item an argument list: a delimited list, containing both delimiters
and separators.
\end{itemize}
It would be great to have a polymorphic type for this purpose, but
until Fortran 2003 is out we have to emulate this.
Here are the syntax element codes:
<<Syntax: public>>=
public :: S_UNKNOWN
public :: S_LOGICAL, S_INTEGER, S_REAL, S_COMPLEX, S_QUOTED
public :: S_IDENTIFIER, S_KEYWORD
public :: S_SEQUENCE, S_LIST, S_GROUP, S_ARGS
public :: S_ALTERNATIVE
public :: S_IGNORE
<<Syntax: parameters>>=
integer, parameter :: &
S_UNKNOWN = 0, &
S_LOGICAL = 1, S_INTEGER = 2, S_REAL = 3, S_COMPLEX = 4, &
S_QUOTED = 5, S_IDENTIFIER = 6, S_KEYWORD = 7, &
S_SEQUENCE = 8, S_LIST = 9, S_GROUP = 10, S_ARGS = 11, &
S_ALTERNATIVE = 12, &
S_IGNORE = 99
@ We need arrays of rule pointers, therefore this construct.
<<Syntax: types>>=
type :: rule_p
private
type(syntax_rule_t), pointer :: p => null ()
end type rule_p
@ %def rule_p
@ Return the association status of the rule pointer:
<<Syntax: subroutines>>=
elemental function rule_is_associated (rp) result (ok)
logical :: ok
type (rule_p), intent(in) :: rp
ok = associated (rp%p)
end function rule_is_associated
@ %def rule_is_associated
@ The rule type is one of the types listed above, represented by an
integer code. The keyword, for a non-keyword rule, is an identifier
used for the printed syntax table. The array of children is needed
for nonterminal rules. In that case, there is a modifier for the last
element (blank, "?", "*", or "+"), mirrored in the flags [[opt]] and
[[rep]]. Then, we have the character constants used as separators and
delimiters for this rule. Finally, the [[used]] flag can be set to
indicate that this rule is the child of another rule.
<<Syntax: types>>=
public :: syntax_rule_t
<<Syntax: types>>=
type :: syntax_rule_t
private
integer :: type = S_UNKNOWN
logical :: used = .false.
type(string_t) :: keyword
type(string_t) :: separator
type(string_t), dimension(2) :: delimiter
type(rule_p), dimension(:), allocatable :: child
character(1) :: modifier = ""
logical :: opt = .false., rep = .false.
contains
<<Syntax: syntax rule: TBP>>
end type syntax_rule_t
@ %def syntax_rule_t
@ Initializer: Set type and key for a rule, but do not (yet) allocate
anything. Finalizer: not needed (no pointer components).
<<Syntax: subroutines>>=
subroutine syntax_rule_init (rule, key, type)
type(syntax_rule_t), intent(inout) :: rule
type(string_t), intent(in) :: key
integer, intent(in) :: type
rule%keyword = key
rule%type = type
select case (rule%type)
case (S_GROUP)
call syntax_rule_set_delimiter (rule)
case (S_LIST)
call syntax_rule_set_separator (rule)
case (S_ARGS)
call syntax_rule_set_delimiter (rule)
call syntax_rule_set_separator (rule)
end select
end subroutine syntax_rule_init
@ %def syntax_rule_init
@ These characters will not be enclosed in quotes when writing syntax
rules:
<<Syntax: parameters>>=
character(*), parameter :: &
UNQUOTED = "(),|_"//LCLETTERS//UCLETTERS//DIGITS
@
\subsection{I/O}
Write an account of the rule. Setting [[short]] true will suppress
the node type. Setting [[key_only]] true will suppress the
definition. Setting [[advance]] false will suppress the trailing
newline.
<<Syntax: public>>=
public :: syntax_rule_write
<<Syntax: syntax rule: TBP>>=
procedure :: write => syntax_rule_write
<<Syntax: subroutines>>=
subroutine syntax_rule_write (rule, unit, short, key_only, advance)
class(syntax_rule_t), intent(in) :: rule
integer, intent(in), optional :: unit
logical, intent(in), optional :: short, key_only, advance
logical :: typ, def, adv
integer :: u
u = given_output_unit (unit); if (u < 0) return
typ = .true.; if (present (short)) typ = .not. short
def = .true.; if (present (key_only)) def = .not. key_only
adv = .true.; if (present (advance)) adv = advance
select case (rule%type)
case (S_UNKNOWN); call write_atom ("???", typ)
case (S_IGNORE); call write_atom ("IGNORE", typ)
case (S_LOGICAL); call write_atom ("LOGICAL", typ)
case (S_INTEGER); call write_atom ("INTEGER", typ)
case (S_REAL); call write_atom ("REAL", typ)
case (S_COMPLEX); call write_atom ("COMPLEX", typ)
case (S_IDENTIFIER); call write_atom ("IDENTIFIER", typ)
case (S_KEYWORD); call write_atom ("KEYWORD", typ)
case (S_QUOTED)
call write_quotes (typ, def, &
del = rule%delimiter)
case (S_SEQUENCE)
call write_sequence ("SEQUENCE", typ, def, size (rule%child))
case (S_GROUP)
call write_sequence ("GROUP", typ, def, size (rule%child), &
del = rule%delimiter)
case (S_LIST)
call write_sequence ("LIST", typ, def, size (rule%child), &
sep = rule%separator)
case (S_ARGS)
call write_sequence ("ARGUMENTS", typ, def, size (rule%child), &
del = rule%delimiter, &
sep = rule%separator)
case (S_ALTERNATIVE)
call write_sequence ("ALTERNATIVE", typ, def, size (rule%child), &
sep = var_str ("|"))
end select
if (adv) write (u, *)
contains
subroutine write_type (type)
character(*), intent(in) :: type
character(11) :: str
str = type
write (u, "(1x,A)", advance="no") str
end subroutine write_type
subroutine write_key
write (u, "(1x,A)", advance="no") char (wkey (rule))
end subroutine write_key
subroutine write_atom (type, typ)
character(*), intent(in) :: type
logical, intent(in) :: typ
if (typ) call write_type (type)
call write_key
end subroutine write_atom
subroutine write_maybe_quoted (string)
character(*), intent(in) :: string
character, parameter :: q = "'"
character, parameter :: qq = '"'
if (verify (string, UNQUOTED) == 0) then
write (u, "(1x,A)", advance = "no") trim (string)
else if (verify (string, q) == 0) then
write (u, "(1x,A)", advance = "no") qq // trim (string) // qq
else
write (u, "(1x,A)", advance = "no") q // trim (string) // q
end if
end subroutine write_maybe_quoted
subroutine write_quotes (typ, def, del)
logical, intent(in) :: typ, def
type(string_t), dimension(2), intent(in) :: del
if (typ) call write_type ("QUOTED")
call write_key
if (def) then
write (u, "(1x,'=')", advance="no")
call write_maybe_quoted (char (del(1)))
write (u, "(1x,A)", advance="no") "..."
call write_maybe_quoted (char (del(2)))
end if
end subroutine write_quotes
subroutine write_sequence (type, typ, def, n, del, sep)
character(*), intent(in) :: type
logical, intent(in) :: typ, def
integer, intent(in) :: n
type(string_t), dimension(2), intent(in), optional :: del
type(string_t), intent(in), optional :: sep
integer :: i
if (typ) call write_type (type)
call write_key
if (def) then
write (u, "(1x,'=')", advance="no")
if (present (del)) call write_maybe_quoted (char (del(1)))
do i = 1, n
if (i > 1 .and. present (sep)) &
call write_maybe_quoted (char (sep))
write (u, "(1x,A)", advance="no") &
char (wkey (syntax_rule_get_sub_ptr(rule, i)))
if (i == n) write (u, "(A)", advance="no") trim (rule%modifier)
end do
if (present (del)) call write_maybe_quoted (char (del(2)))
end if
end subroutine write_sequence
end subroutine syntax_rule_write
@ %def syntax_rule_write
@ In the printed representation, the keyword strings are enclosed as
[[<...>]], unless they are bare keywords. Bare keywords are enclosed
as [['..']] if they contain a character which is not a letter, digit,
or underscore. If they contain a single-quote character, they are
enclosed as [[".."]]. (A keyword must not contain both single- and
double-quotes.)
<<Syntax: subroutines>>=
function wkey (rule) result (string)
type(string_t) :: string
type(syntax_rule_t), intent(in) :: rule
select case (rule%type)
case (S_KEYWORD)
if (verify (rule%keyword, UNQUOTED) == 0) then
string = rule%keyword
else if (scan (rule%keyword, "'") == 0) then
string = "'" // rule%keyword // "'"
else
string = '"' // rule%keyword // '"'
end if
case default
string = "<" // rule%keyword // ">"
end select
end function wkey
@ %def wkey
@
\subsection{Completing syntax rules}
Set the separator and delimiter entries, using defaults:
<<Syntax: subroutines>>=
subroutine syntax_rule_set_separator (rule, separator)
type(syntax_rule_t), intent(inout) :: rule
type(string_t), intent(in), optional :: separator
if (present (separator)) then
rule%separator = separator
else
rule%separator = ","
end if
end subroutine syntax_rule_set_separator
subroutine syntax_rule_set_delimiter (rule, delimiter)
type(syntax_rule_t), intent(inout) :: rule
type(string_t), dimension(2), intent(in), optional :: delimiter
if (present (delimiter)) then
rule%delimiter = delimiter
else
rule%delimiter(1) = "("
rule%delimiter(2) = ")"
end if
end subroutine syntax_rule_set_delimiter
@ %def syntax_rule_set_separator syntax_rule_set_delimiter
@ Set the modifier entry and corresponding flags:
<<Syntax: subroutines>>=
function is_modifier (string) result (ok)
logical :: ok
type(string_t), intent(in) :: string
select case (char (string))
case (" ", "?", "*", "+"); ok = .true.
case default; ok = .false.
end select
end function is_modifier
subroutine syntax_rule_set_modifier (rule, modifier)
type(syntax_rule_t), intent(inout) :: rule
type(string_t), intent(in) :: modifier
rule%modifier = char (modifier)
select case (rule%modifier)
case (" ")
case ("?"); rule%opt = .true.
case ("*"); rule%opt = .true.; rule%rep = .true.
case ("+"); rule%rep = .true.
case default
call msg_bug (" Syntax: sequence modifier '" // rule%modifier &
// "' is not one of '+' '*' '?'")
end select
end subroutine syntax_rule_set_modifier
@ %def is_modifier syntax_rule_set_modifier
@ Check a finalized rule for completeness
<<Syntax: subroutines>>=
subroutine syntax_rule_check (rule)
type(syntax_rule_t), intent(in) :: rule
if (rule%keyword == "") call msg_bug ("Rule key not set")
select case (rule%type)
case (S_UNKNOWN); call bug (" Undefined rule")
case (S_IGNORE, S_LOGICAL, S_INTEGER, S_REAL, S_COMPLEX, &
S_IDENTIFIER, S_KEYWORD)
case (S_QUOTED)
if (rule%delimiter(1) == "" .or. rule%delimiter(2) == "") &
call bug (" Missing quote character(s)")
case (S_SEQUENCE)
case (S_GROUP)
if (rule%delimiter(1) == "" .or. rule%delimiter(2) == "") &
call bug (" Missing delimiter(s)")
case (S_LIST)
if (rule%separator == "") call bug (" Missing separator")
case (S_ARGS)
if (rule%delimiter(1) == "" .or. rule%delimiter(2) == "") &
call bug (" Missing delimiter(s)")
if (rule%separator == "") call bug (" Missing separator")
case (S_ALTERNATIVE)
case default
call bug (" Undefined syntax code")
end select
select case (rule%type)
case (S_SEQUENCE, S_GROUP, S_LIST, S_ARGS, S_ALTERNATIVE)
if (allocated (rule%child)) then
if (.not.all (rule_is_associated (rule%child))) &
call bug (" Child rules not all associated")
else
call bug (" Parent rule without children")
end if
case default
if (allocated (rule%child)) call bug (" Non-parent rule with children")
end select
contains
subroutine bug (string)
character(*), intent(in) :: string
call msg_bug (" Syntax table: Rule " // char (rule%keyword) // ": " &
// string)
end subroutine bug
end subroutine syntax_rule_check
@ %def syntax_rule_check
@
\subsection{Accessing rules}
This is the API for syntax rules:
<<Syntax: public>>=
public :: syntax_rule_get_type
<<Syntax: subroutines>>=
function syntax_rule_get_type (rule) result (type)
integer :: type
type(syntax_rule_t), intent(in) :: rule
type = rule%type
end function syntax_rule_get_type
@ %def syntax_rule_get_type
<<Syntax: public>>=
public :: syntax_rule_get_key
<<Syntax: syntax rule: TBP>>=
procedure :: get_key => syntax_rule_get_key
<<Syntax: subroutines>>=
function syntax_rule_get_key (rule) result (key)
class(syntax_rule_t), intent(in) :: rule
type(string_t) :: key
key = rule%keyword
end function syntax_rule_get_key
@ %def syntax_rule_get_key
<<Syntax: public>>=
public :: syntax_rule_get_separator
public :: syntax_rule_get_delimiter
<<Syntax: subroutines>>=
function syntax_rule_get_separator (rule) result (separator)
type(string_t) :: separator
type(syntax_rule_t), intent(in) :: rule
separator = rule%separator
end function syntax_rule_get_separator
function syntax_rule_get_delimiter (rule) result (delimiter)
type(string_t), dimension(2) :: delimiter
type(syntax_rule_t), intent(in) :: rule
delimiter = rule%delimiter
end function syntax_rule_get_delimiter
@ %def syntax_rule_get_separator syntax_rule_get_delimiter
@ Accessing child rules. If we use [[syntax_rule_get_n_sub]] for determining
loop bounds, we do not need a check in the second routine.
<<Syntax: public>>=
public :: syntax_rule_get_n_sub
public :: syntax_rule_get_sub_ptr
<<Syntax: subroutines>>=
function syntax_rule_get_n_sub (rule) result (n)
integer :: n
type(syntax_rule_t), intent(in) :: rule
if (allocated (rule%child)) then
n = size (rule%child)
else
n = 0
end if
end function syntax_rule_get_n_sub
function syntax_rule_get_sub_ptr (rule, i) result (sub)
type(syntax_rule_t), pointer :: sub
type(syntax_rule_t), intent(in), target :: rule
integer, intent(in) :: i
sub => rule%child(i)%p
end function syntax_rule_get_sub_ptr
subroutine syntax_rule_set_sub (rule, i, sub)
type(syntax_rule_t), intent(inout) :: rule
integer, intent(in) :: i
type(syntax_rule_t), intent(in), target :: sub
rule%child(i)%p => sub
end subroutine syntax_rule_set_sub
@ %def syntax_rule_get_n_sub syntax_rule_get_sub_ptr syntax_rule_set_sub
@ Return the modifier flags:
<<Syntax: public>>=
public :: syntax_rule_last_optional
public :: syntax_rule_last_repetitive
<<Syntax: subroutines>>=
function syntax_rule_last_optional (rule) result (opt)
logical :: opt
type(syntax_rule_t), intent(in) :: rule
opt = rule%opt
end function syntax_rule_last_optional
function syntax_rule_last_repetitive (rule) result (rep)
logical :: rep
type(syntax_rule_t), intent(in) :: rule
rep = rule%rep
end function syntax_rule_last_repetitive
@ %def syntax_rule_last_optional syntax_rule_last_repetitive
@
Return true if the rule is atomic, i.e., logical, real, keyword etc.
<<Syntax: public>>=
public :: syntax_rule_is_atomic
<<Syntax: subroutines>>=
function syntax_rule_is_atomic (rule) result (atomic)
logical :: atomic
type(syntax_rule_t), intent(in) :: rule
select case (rule%type)
case (S_LOGICAL, S_INTEGER, S_REAL, S_COMPLEX, S_IDENTIFIER, &
S_KEYWORD, S_QUOTED)
atomic = .true.
case default
atomic = .false.
end select
end function syntax_rule_is_atomic
@ %def syntax_rule_is_atomic
@
\subsection{Syntax tables}
A syntax table contains the tree of syntax rules and, for direct
parser access, the list of valid keywords.
\subsubsection{Types}
The syntax contains an array of rules and a list of keywords. The
array is actually used as a tree, where the top rule is the first
array element, and the other rules are recursively pointed to by this
first rule. (No rule should be used twice or be unused.) The keyword
list is derived from the rule tree.
Objects of this type need the target attribute if they are associated
with a lexer. The keyword list will be pointed to by this lexer.
<<Syntax: public>>=
public :: syntax_t
<<Syntax: types>>=
type :: syntax_t
private
type(syntax_rule_t), dimension(:), allocatable :: rule
type(keyword_list_t) :: keyword_list
end type syntax_t
@ %def syntax_t
@
\subsubsection{Constructor/destructor}
Initialize and finalize syntax tables
<<Syntax: public>>=
public :: syntax_init
public :: syntax_final
@ There are two ways to create a syntax: hard-coded from rules or
dynamically from file.
<<Syntax: interfaces>>=
interface syntax_init
module procedure syntax_init_from_ifile
end interface
@ %def syntax_init
@ The syntax definition is read from an [[ifile]] object which
contains the syntax definitions in textual form, one rule per line.
This interface allows for determining the number of rules beforehand.
To parse the rule definitions, we make up a temporary lexer.
Obviously, we cannot use a generic parser yet, so we have to hardcode
the parsing process.
<<Syntax: subroutines>>=
subroutine syntax_init_from_ifile (syntax, ifile)
type(syntax_t), intent(out), target :: syntax
type(ifile_t), intent(in) :: ifile
type(lexer_t) :: lexer
type(line_p) :: line
type(string_t) :: string
integer :: n_token
integer :: i
call lexer_init (lexer, &
comment_chars = "", &
quote_chars = "<'""", &
quote_match = ">'""", &
single_chars = "?*+|=,()", &
special_class = ["."], &
keyword_list = null ())
allocate (syntax%rule (ifile_get_length (ifile)))
call line_init (line, ifile)
do i = 1, size (syntax%rule)
string = line_get_string_advance (line)
call set_rule_type_and_key (syntax%rule(i), string, lexer)
end do
call line_init (line, ifile)
do i = 1, size (syntax%rule)
string = line_get_string_advance (line)
select case (syntax%rule(i)%type)
case (S_QUOTED, S_SEQUENCE, S_GROUP, S_LIST, S_ARGS, S_ALTERNATIVE)
n_token = get_n_token (string, lexer)
call set_rule_contents &
(syntax%rule(i), syntax, n_token, string, lexer)
end select
end do
call line_final (line)
call lexer_final (lexer)
call syntax_make_keyword_list (syntax)
if (.not. all (syntax%rule%used)) then
do i = 1, size (syntax%rule)
if (.not. syntax%rule(i)%used) then
call syntax_rule_write (syntax%rule(i), 6)
end if
end do
call msg_bug (" Syntax table: unused rules")
end if
end subroutine syntax_init_from_ifile
@ %def syntax_init_from_ifile
@ For a given rule defined in the input, the first task is to
determine its type and key. With these, we can initialize the rule in
the table, postponing the association of children.
<<Syntax: subroutines>>=
subroutine set_rule_type_and_key (rule, string, lexer)
type(syntax_rule_t), intent(inout) :: rule
type(string_t), intent(in) :: string
type(lexer_t), intent(inout) :: lexer
type(stream_t), target :: stream
type(lexeme_t) :: lexeme
type(string_t) :: key
character(2) :: type
call lexer_clear (lexer)
call stream_init (stream, string)
call lexer_assign_stream (lexer, stream)
call lex (lexeme, lexer)
type = lexeme_get_string (lexeme)
call lex (lexeme, lexer)
key = lexeme_get_contents (lexeme)
call stream_final (stream)
if (trim (key) /= "") then
select case (type)
case ("IG"); call syntax_rule_init (rule, key, S_IGNORE)
case ("LO"); call syntax_rule_init (rule, key, S_LOGICAL)
case ("IN"); call syntax_rule_init (rule, key, S_INTEGER)
case ("RE"); call syntax_rule_init (rule, key, S_REAL)
case ("CO"); call syntax_rule_init (rule, key, S_COMPLEX)
case ("ID"); call syntax_rule_init (rule, key, S_IDENTIFIER)
case ("KE"); call syntax_rule_init (rule, key, S_KEYWORD)
case ("QU"); call syntax_rule_init (rule, key, S_QUOTED)
case ("SE"); call syntax_rule_init (rule, key, S_SEQUENCE)
case ("GR"); call syntax_rule_init (rule, key, S_GROUP)
case ("LI"); call syntax_rule_init (rule, key, S_LIST)
case ("AR"); call syntax_rule_init (rule, key, S_ARGS)
case ("AL"); call syntax_rule_init (rule, key, S_ALTERNATIVE)
case default
call lexer_show_location (lexer)
call msg_bug (" Syntax definition: unknown type '" // type // "'")
end select
else
print *, char (string)
call msg_bug (" Syntax definition: empty rule key")
end if
end subroutine set_rule_type_and_key
@ %def set_rule_type_and_key
@ This function returns the number of tokens in an input line.
<<Syntax: subroutines>>=
function get_n_token (string, lexer) result (n)
integer :: n
type(string_t), intent(in) :: string
type(lexer_t), intent(inout) :: lexer
type(stream_t), target :: stream
type(lexeme_t) :: lexeme
integer :: i
call lexer_clear (lexer)
call stream_init (stream, string)
call lexer_assign_stream (lexer, stream)
i = 0
do
call lex (lexeme, lexer)
if (lexeme_is_break (lexeme)) exit
i = i + 1
end do
n = i
call stream_final (stream)
end function get_n_token
@ %def get_n_token
@ This subroutine extracts the rule contents for an input line. There
are three tasks: (1) determine the number of children, depending on
the rule type; (2) find and set the separator and delimiter strings,
if required; (3) scan the child rules, find them in the syntax
table and associate the parent rule with them.
<<Syntax: subroutines>>=
subroutine set_rule_contents (rule, syntax, n_token, string, lexer)
type(syntax_rule_t), intent(inout) :: rule
type(syntax_t), intent(in), target :: syntax
integer, intent(in) :: n_token
type(string_t), intent(in) :: string
type(lexer_t), intent(inout) :: lexer
type(stream_t), target :: stream
type(lexeme_t), dimension(n_token) :: lexeme
integer :: i, n_children
call lexer_clear (lexer)
call stream_init (stream, string)
call lexer_assign_stream (lexer, stream)
do i = 1, n_token
call lex (lexeme(i), lexer)
end do
call stream_final (stream)
n_children = get_n_children ()
call set_delimiters
if (n_children > 1) call set_separator
if (n_children > 0) call set_children
contains
function get_n_children () result (n_children)
integer :: n_children
select case (rule%type)
case (S_QUOTED)
if (n_token /= 6) call broken_rule (rule)
n_children = 0
case (S_GROUP)
if (n_token /= 6) call broken_rule (rule)
n_children = 1
case (S_SEQUENCE)
if (is_modifier (lexeme_get_string (lexeme(n_token)))) then
if (n_token <= 4) call broken_rule (rule)
call syntax_rule_set_modifier &
(rule, lexeme_get_string (lexeme(n_token)))
n_children = n_token - 4
else
if (n_token <= 3) call broken_rule (rule)
n_children = n_token - 3
end if
case (S_LIST)
if (is_modifier (lexeme_get_string (lexeme(n_token)))) then
if (n_token <= 4 .or. mod (n_token, 2) /= 1) &
call broken_rule (rule)
call syntax_rule_set_modifier &
(rule, lexeme_get_string (lexeme(n_token)))
else if (n_token <= 3 .or. mod (n_token, 2) /= 0) then
call broken_rule (rule)
end if
n_children = (n_token - 2) / 2
case (S_ARGS)
if (is_modifier (lexeme_get_string (lexeme(n_token-1)))) then
if (n_token <= 6 .or. mod (n_token, 2) /= 1) &
call broken_rule (rule)
call syntax_rule_set_modifier &
(rule, lexeme_get_string (lexeme(n_token-1)))
else if (n_token <= 5 .or. mod (n_token, 2) /= 0) then
call broken_rule (rule)
end if
n_children = (n_token - 4) / 2
case (S_ALTERNATIVE)
if (n_token <= 3 .or. mod (n_token, 2) /= 0) call broken_rule (rule)
n_children = (n_token - 2) / 2
end select
end function get_n_children
subroutine set_delimiters
type(string_t), dimension(2) :: delimiter
select case (rule%type)
case (S_QUOTED, S_GROUP, S_ARGS)
delimiter(1) = lexeme_get_contents (lexeme(4))
delimiter(2) = lexeme_get_contents (lexeme(n_token))
call syntax_rule_set_delimiter (rule, delimiter)
end select
end subroutine set_delimiters
subroutine set_separator
type(string_t) :: separator
select case (rule%type)
case (S_LIST)
separator = lexeme_get_contents (lexeme(5))
call syntax_rule_set_separator (rule, separator)
case (S_ARGS)
separator = lexeme_get_contents (lexeme(6))
call syntax_rule_set_separator (rule, separator)
end select
end subroutine set_separator
subroutine set_children
allocate (rule%child(n_children))
select case (rule%type)
case (S_GROUP)
call syntax_rule_set_sub (rule, 1, syntax_get_rule_ptr (syntax, &
lexeme_get_contents (lexeme(5))))
case (S_SEQUENCE)
do i = 1, n_children
call syntax_rule_set_sub (rule, i, syntax_get_rule_ptr (syntax, &
lexeme_get_contents (lexeme(i+3))))
end do
case (S_LIST, S_ALTERNATIVE)
do i = 1, n_children
call syntax_rule_set_sub (rule, i, syntax_get_rule_ptr (syntax, &
lexeme_get_contents (lexeme(2*i+2))))
end do
case (S_ARGS)
do i = 1, n_children
call syntax_rule_set_sub (rule, i, syntax_get_rule_ptr (syntax, &
lexeme_get_contents (lexeme(2*i+3))))
end do
end select
end subroutine set_children
subroutine broken_rule (rule)
type(syntax_rule_t), intent(in) :: rule
call lexer_show_location (lexer)
call msg_bug (" Syntax definition: broken rule '" &
// char (wkey (rule)) // "'")
end subroutine broken_rule
end subroutine set_rule_contents
@ %def set_rule_contents
@ This routine completes the syntax table object. We assume that the
rule array is set up. We associate the top rule with the first entry
in the rule array and build up the keyword list.
The keyword list includes delimiters and separators. Filling it can
only be done after all rules are set. We scan the rule tree. For
each keyword that we find, we try to add it to the keyword list; the
pointer to the last element is carried along with the recursive
scanning. Before appending a keyword, we check whether it is already
in the list.
<<Syntax: subroutines>>=
subroutine syntax_make_keyword_list (syntax)
type(syntax_t), intent(inout), target :: syntax
type(syntax_rule_t), pointer :: rule
rule => syntax%rule(1)
call rule_scan_rec (rule, syntax%keyword_list)
contains
recursive subroutine rule_scan_rec (rule, keyword_list)
type(syntax_rule_t), pointer :: rule
type(keyword_list_t), intent(inout) :: keyword_list
integer :: i
if (rule%used) return
rule%used = .true.
select case (rule%type)
case (S_UNKNOWN)
call msg_bug (" Syntax: rule tree contains undefined rule")
case (S_KEYWORD)
call keyword_list_add (keyword_list, rule%keyword)
end select
select case (rule%type)
case (S_LIST, S_ARGS)
call keyword_list_add (keyword_list, rule%separator)
end select
select case (rule%type)
case (S_GROUP, S_ARGS)
call keyword_list_add (keyword_list, rule%delimiter(1))
call keyword_list_add (keyword_list, rule%delimiter(2))
end select
select case (rule%type)
case (S_SEQUENCE, S_GROUP, S_LIST, S_ARGS, S_ALTERNATIVE)
if (.not. allocated (rule%child)) &
call msg_bug (" Syntax: Non-terminal rule without children")
case default
if (allocated (rule%child)) &
call msg_bug (" Syntax: Terminal rule with children")
end select
if (allocated (rule%child)) then
do i = 1, size (rule%child)
call rule_scan_rec (rule%child(i)%p, keyword_list)
end do
end if
end subroutine rule_scan_rec
end subroutine syntax_make_keyword_list
@ %def syntax_make_keyword_list
@ The finalizer deallocates the rule pointer array and deletes the
keyword list.
<<Syntax: subroutines>>=
subroutine syntax_final (syntax)
type(syntax_t), intent(inout) :: syntax
if (allocated (syntax%rule)) deallocate (syntax%rule)
call keyword_list_final (syntax%keyword_list)
end subroutine syntax_final
@ %def syntax_final
@
\subsection{Accessing the syntax table}
Return a pointer to the top rule
<<Syntax: public>>=
public :: syntax_get_top_rule_ptr
<<Syntax: subroutines>>=
function syntax_get_top_rule_ptr (syntax) result (rule)
type(syntax_rule_t), pointer :: rule
type(syntax_t), intent(in), target :: syntax
if (allocated (syntax%rule)) then
rule => syntax%rule(1)
else
rule => null ()
end if
end function syntax_get_top_rule_ptr
@ %def syntax_get_top_rule_ptr
@ Assign the pointer to the rule associated with a given key (assumes
that the rule array is allocated)
<<Syntax: public>>=
public :: syntax_get_rule_ptr
<<Syntax: subroutines>>=
function syntax_get_rule_ptr (syntax, key) result (rule)
type(syntax_rule_t), pointer :: rule
type(syntax_t), intent(in), target :: syntax
type(string_t), intent(in) :: key
integer :: i
do i = 1, size (syntax%rule)
if (syntax%rule(i)%keyword == key) then
rule => syntax%rule(i)
return
end if
end do
call msg_bug (" Syntax table: Rule " // char (key) // " not found")
end function syntax_get_rule_ptr
@ %def syntax_get_rule_ptr
@ Return a pointer to the keyword list
<<Syntax: public>>=
public :: syntax_get_keyword_list_ptr
<<Syntax: subroutines>>=
function syntax_get_keyword_list_ptr (syntax) result (keyword_list)
type(keyword_list_t), pointer :: keyword_list
type(syntax_t), intent(in), target :: syntax
keyword_list => syntax%keyword_list
end function syntax_get_keyword_list_ptr
@ %def syntax_get_keyword_list_ptr
@
\subsection{I/O}
Write a readable representation of the syntax table
<<Syntax: public>>=
public :: syntax_write
<<Syntax: subroutines>>=
subroutine syntax_write (syntax, unit)
type(syntax_t), intent(in) :: syntax
integer, intent(in), optional :: unit
integer :: u
integer :: i
u = given_output_unit (unit); if (u < 0) return
write (u, "(A)") "Syntax table:"
if (allocated (syntax%rule)) then
do i = 1, size (syntax%rule)
call syntax_rule_write (syntax%rule(i), u)
end do
else
write (u, "(1x,A)") "[not allocated]"
end if
call keyword_list_write (syntax%keyword_list, u)
end subroutine syntax_write
@ %def syntax_write
@
\clearpage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{The parser}
On a small scale, the parser interprets the string tokens returned by
the lexer; they are interpreted as numbers, keywords and such and
stored as a typed object. On a large scale, a text is read, parsed,
and a syntax rule set is applied such that the tokens are stored as a
parse tree. Syntax errors are spotted in this process, so the
resulting parse tree is syntactically correct by definition.
<<[[parser.f90]]>>=
<<File header>>
module parser
<<Use kinds>>
<<Use strings>>
use io_units
use format_defs, only: FMT_19
use system_defs, only: DIGITS
use diagnostics
use md5
use lexers
use syntax_rules
<<Standard module head>>
<<Parser: public>>
<<Parser: types>>
<<Parser: interfaces>>
contains
<<Parser: procedures>>
end module parser
@ %def parser
@
\subsection{The token type}
Tokens are elements of the parsed input that carry a value: logical,
integer, real, quoted string, (unquoted) identifier, or known
keyword. Note that non-keyword tokens also have an abstract key
attached to them.
This is an obvious candidate for polymorphism.
<<Parser: types>>=
type :: token_t
private
integer :: type = S_UNKNOWN
logical, pointer :: lval => null ()
integer, pointer :: ival => null ()
real(default), pointer :: rval => null ()
complex(default), pointer :: cval => null ()
type(string_t), pointer :: sval => null ()
type(string_t), pointer :: kval => null ()
type(string_t), dimension(:), pointer :: quote => null ()
end type token_t
@ %def token_t
@ Create a token from the lexeme returned by the lexer: Allocate
storage and try to interpret the lexeme according to the type that is
requested by the parser. For a keyword token, match the lexeme
against the requested key. If successful, set the token type, value,
and key. Otherwise, set the type to [[S_UNKNOWN]].
<<Parser: procedures>>=
subroutine token_init (token, lexeme, requested_type, key)
type(token_t), intent(out) :: token
type(lexeme_t), intent(in) :: lexeme
integer, intent(in) :: requested_type
type(string_t), intent(in) :: key
integer :: type
type = lexeme_get_type (lexeme)
token%type = S_UNKNOWN
select case (requested_type)
case (S_LOGICAL)
if (type == T_IDENTIFIER) call read_logical &
(char (lexeme_get_string (lexeme)))
case (S_INTEGER)
if (type == T_NUMERIC) call read_integer &
(char (lexeme_get_string (lexeme)))
case (S_REAL)
if (type == T_NUMERIC) call read_real &
(char (lexeme_get_string (lexeme)))
case (S_COMPLEX)
if (type == T_NUMERIC) call read_complex &
(char (lexeme_get_string (lexeme)))
case (S_IDENTIFIER)
if (type == T_IDENTIFIER) call read_identifier &
(lexeme_get_string (lexeme))
case (S_KEYWORD)
if (type == T_KEYWORD) call check_keyword &
(lexeme_get_string (lexeme), key)
case (S_QUOTED)
if (type == T_QUOTED) call read_quoted &
(lexeme_get_contents (lexeme), lexeme_get_delimiters (lexeme))
case default
print *, requested_type
call msg_bug (" Invalid token type code requested by the parser")
end select
if (token%type /= S_UNKNOWN) then
allocate (token%kval)
token%kval = key
end if
contains
subroutine read_logical (s)
character(*), intent(in) :: s
select case (s)
case ("t", "T", "true", "TRUE", "y", "Y", "yes", "YES")
allocate (token%lval)
token%lval = .true.
token%type = S_LOGICAL
case ("f", "F", "false", "FALSE", "n", "N", "no", "NO")
allocate (token%lval)
token%lval = .false.
token%type = S_LOGICAL
end select
end subroutine read_logical
subroutine read_integer (s)
character(*), intent(in) :: s
integer :: tmp, iostat
if (verify (s, DIGITS) == 0) then
read (s, *, iostat=iostat) tmp
if (iostat == 0) then
allocate (token%ival)
token%ival = tmp
token%type = S_INTEGER
end if
end if
end subroutine read_integer
subroutine read_real (s)
character(*), intent(in) :: s
real(default) :: tmp
integer :: iostat
read (s, *, iostat=iostat) tmp
if (iostat == 0) then
allocate (token%rval)
token%rval = tmp
token%type = S_REAL
end if
end subroutine read_real
subroutine read_complex (s)
character(*), intent(in) :: s
complex(default) :: tmp
integer :: iostat
read (s, *, iostat=iostat) tmp
if (iostat == 0) then
allocate (token%cval)
token%cval = tmp
token%type = S_COMPLEX
end if
end subroutine read_complex
subroutine read_identifier (s)
type(string_t), intent(in) :: s
allocate (token%sval)
token%sval = s
token%type = S_IDENTIFIER
end subroutine read_identifier
subroutine check_keyword (s, key)
type(string_t), intent(in) :: s
type(string_t), intent(in) :: key
if (key == s) token%type = S_KEYWORD
end subroutine check_keyword
subroutine read_quoted (s, del)
type(string_t), intent(in) :: s
type(string_t), dimension(2), intent(in) :: del
allocate (token%sval, token%quote(2))
token%sval = s
token%quote(1) = del(1)
token%quote(2) = del(2)
token%type = S_QUOTED
end subroutine read_quoted
end subroutine token_init
@ %def token_init
@ Manually set a token to a keyword.
<<Parser: procedures>>=
subroutine token_init_key (token, key)
type(token_t), intent(out) :: token
type(string_t), intent(in) :: key
token%type = S_KEYWORD
allocate (token%kval)
token%kval = key
end subroutine token_init_key
@ %def token_init_key
@ Reset a token to an empty state, freeing allocated memory, and
deallocate the token itself.
<<Parser: procedures>>=
subroutine token_final (token)
type(token_t), intent(inout) :: token
token%type = S_UNKNOWN
if (associated (token%lval)) deallocate (token%lval)
if (associated (token%ival)) deallocate (token%ival)
if (associated (token%rval)) deallocate (token%rval)
if (associated (token%sval)) deallocate (token%sval)
if (associated (token%kval)) deallocate (token%kval)
if (associated (token%quote)) deallocate (token%quote)
end subroutine token_final
@ %def token_final
@ Check for empty=valid token:
<<Parser: procedures>>=
function token_is_valid (token) result (valid)
logical :: valid
type(token_t), intent(in) :: token
valid = token%type /= S_UNKNOWN
end function token_is_valid
@ %def token_is_valid
@ Write the contents of a token.
<<Parser: procedures>>=
subroutine token_write (token, unit)
type(token_t), intent(in) :: token
integer, intent(in), optional :: unit
integer :: u
u = given_output_unit (unit); if (u < 0) return
select case (token%type)
case (S_LOGICAL)
write (u, "(L1)") token%lval
case (S_INTEGER)
write (u, "(I0)") token%ival
case (S_REAL)
write (u, "(" // FMT_19 // ")") token%rval
case (S_COMPLEX)
write (u, "('('," // FMT_19 // ",','," // FMT_19 // ",')')") token%cval
case (S_IDENTIFIER)
write (u, "(A)") char (token%sval)
case (S_KEYWORD)
write (u, "(A,A)") '[keyword] ' // char (token%kval)
case (S_QUOTED)
write (u, "(A)") &
char (token%quote(1)) // char (token%sval) // char (token%quote(2))
case default
write (u, "(A)") '[empty]'
end select
end subroutine token_write
@ %def token_write
@ Token assignment via deep copy. This is useful to avoid confusion
when the token is transferred to some parse-tree node.
<<Parser: interfaces>>=
interface assignment(=)
module procedure token_assign
module procedure token_assign_integer
module procedure token_assign_real
module procedure token_assign_complex
module procedure token_assign_logical
module procedure token_assign_string
end interface
@ %def =
@ We need to copy only the contents that are actually assigned, the
other pointers remain disassociated.
<<Parser: procedures>>=
subroutine token_assign (token, token_in)
type(token_t), intent(out) :: token
type(token_t), intent(in) :: token_in
token%type = token_in%type
select case (token%type)
case (S_LOGICAL); allocate (token%lval); token%lval = token_in%lval
case (S_INTEGER); allocate (token%ival); token%ival = token_in%ival
case (S_REAL); allocate (token%rval); token%rval = token_in%rval
case (S_COMPLEX); allocate (token%cval); token%cval = token_in%cval
case (S_IDENTIFIER); allocate (token%sval); token%sval = token_in%sval
case (S_QUOTED); allocate (token%sval); token%sval = token_in%sval
allocate (token%quote(2)); token%quote = token_in%quote
end select
if (token%type /= S_UNKNOWN) then
allocate (token%kval); token%kval = token_in%kval
end if
end subroutine token_assign
@ %def token_assign
@ We need to copy only the contents that are actually assigned, the
other pointers remain disassociated.
<<Parser: procedures>>=
subroutine token_assign_integer (token, ival)
type(token_t), intent(out) :: token
integer, intent(in) :: ival
token%type = S_INTEGER
allocate (token%ival)
token%ival = ival
end subroutine token_assign_integer
subroutine token_assign_real (token, rval)
type(token_t), intent(out) :: token
real(default), intent(in) :: rval
token%type = S_REAL
allocate (token%rval)
token%rval = rval
end subroutine token_assign_real
subroutine token_assign_complex (token, cval)
type(token_t), intent(out) :: token
complex(default), intent(in) :: cval
token%type = S_COMPLEX
allocate (token%cval)
token%cval = cval
end subroutine token_assign_complex
subroutine token_assign_logical (token, lval)
type(token_t), intent(out) :: token
logical, intent(in) :: lval
token%type = S_LOGICAL
allocate (token%lval)
token%lval = lval
end subroutine token_assign_logical
subroutine token_assign_string (token, sval)
type(token_t), intent(out) :: token
type(string_t), intent(in) :: sval
token%type = S_QUOTED
allocate (token%sval)
token%sval = sval
allocate (token%quote(2)); token%quote = '"'
end subroutine token_assign_string
@ %def token_assign
@
\subsection{Retrieve token contents}
These functions all do a trivial sanity check that should avoid
crashes.
<<Parser: procedures>>=
function token_get_logical (token) result (lval)
logical :: lval
type(token_t), intent(in) :: token
if (associated (token%lval)) then
lval = token%lval
else
call token_mismatch (token, "logical")
end if
end function token_get_logical
function token_get_integer (token) result (ival)
integer :: ival
type(token_t), intent(in) :: token
if (associated (token%ival)) then
ival = token%ival
else
call token_mismatch (token, "integer")
end if
end function token_get_integer
function token_get_real (token) result (rval)
real(default) :: rval
type(token_t), intent(in) :: token
if (associated (token%rval)) then
rval = token%rval
else
call token_mismatch (token, "real")
end if
end function token_get_real
function token_get_cmplx (token) result (cval)
complex(default) :: cval
type(token_t), intent(in) :: token
if (associated (token%cval)) then
cval = token%cval
else
call token_mismatch (token, "complex")
end if
end function token_get_cmplx
function token_get_string (token) result (sval)
type(string_t) :: sval
type(token_t), intent(in) :: token
if (associated (token%sval)) then
sval = token%sval
else
call token_mismatch (token, "string")
end if
end function token_get_string
function token_get_key (token) result (kval)
type(string_t) :: kval
type(token_t), intent(in) :: token
if (associated (token%kval)) then
kval = token%kval
else
call token_mismatch (token, "keyword")
end if
end function token_get_key
function token_get_quote (token) result (quote)
type(string_t), dimension(2) :: quote
type(token_t), intent(in) :: token
if (associated (token%quote)) then
quote = token%quote
else
call token_mismatch (token, "quote")
end if
end function token_get_quote
@ %def token_get_logical token_get_integer token_get_real
@ %def token_get_string token_get_key token_get_quote
<<Parser: procedures>>=
subroutine token_mismatch (token, type)
type(token_t), intent(in) :: token
character(*), intent(in) :: type
write (6, "(A)", advance="no") "Token: "
call token_write (token)
call msg_bug (" Token type mismatch; value required as " // type)
end subroutine token_mismatch
@ %def token_mismatch
@
\subsection{The parse tree: nodes}
The parser will generate a parse tree from the input stream. Each
node in this parse tree points to the syntax rule that was applied.
(Since syntax rules are stored in a pointer-type array within the
syntax table, they qualify as targets.) A leaf node contains a
token. A branch node has subnodes. The subnodes are stored as a
list, so each node also has a [[next]] pointer.
<<Parser: public>>=
public :: parse_node_t
<<Parser: types>>=
type :: parse_node_t
private
type(syntax_rule_t), pointer :: rule => null ()
type(token_t) :: token
integer :: n_sub = 0
type(parse_node_t), pointer :: sub_first => null ()
type(parse_node_t), pointer :: sub_last => null ()
type(parse_node_t), pointer :: next => null ()
contains
<<Parser: parse node: TBP>>
end type parse_node_t
@ %def parse_node_t
@ Container for parse node pointers, useful for creating pointer arrays:
<<Parser: public>>=
public :: parse_node_p
<<Parser: types>>=
type :: parse_node_p
type(parse_node_t), pointer :: ptr => null ()
end type parse_node_p
@ %def parse_node_p
@ Output. The first version writes a node together with its
sub-node tree, organized by indentation.
<<Parser: parse node: TBP>>=
procedure :: write => parse_node_write_rec
<<Parser: public>>=
public :: parse_node_write_rec
<<Parser: procedures>>=
recursive subroutine parse_node_write_rec (node, unit, short, depth)
class(parse_node_t), intent(in), target :: node
integer, intent(in), optional :: unit
logical, intent(in), optional :: short
integer, intent(in), optional :: depth
integer :: u, d
type(parse_node_t), pointer :: current
u = given_output_unit (unit); if (u < 0) return
d = 0; if (present (depth)) d = depth
call parse_node_write (node, u, short=short)
current => node%sub_first
do while (associated (current))
write (u, "(A)", advance = "no") repeat ("| ", d)
call parse_node_write_rec (current, unit, short, d+1)
current => current%next
end do
end subroutine parse_node_write_rec
@ %def parse_node_write_rec
@ This does the actual output for a single node, without recursion.
<<Parser: public>>=
public :: parse_node_write
<<Parser: procedures>>=
subroutine parse_node_write (node, unit, short)
class(parse_node_t), intent(in) :: node
integer, intent(in), optional :: unit
logical, intent(in), optional :: short
integer :: u
type(parse_node_t), pointer :: current
u = given_output_unit (unit); if (u < 0) return
write (u, "('+ ')", advance = "no")
if (associated (node%rule)) then
call syntax_rule_write (node%rule, u, &
short=short, key_only=.true., advance=.false.)
if (token_is_valid (node%token)) then
write (u, "(' = ')", advance="no")
call token_write (node%token, u)
else if (associated (node%sub_first)) then
write (u, "(' = ')", advance="no")
current => node%sub_first
do while (associated (current))
call syntax_rule_write (current%rule, u, &
short=.true., key_only=.true., advance=.false.)
current => current%next
end do
write (u, *)
else
write (u, *)
end if
else
write (u, *) "[empty]"
end if
end subroutine parse_node_write
@ %def parse_node_write
@ Finalize the token and recursively finalize and deallocate all
sub-nodes.
<<Parser: public>>=
public :: parse_node_final
<<Parser: procedures>>=
recursive subroutine parse_node_final (node, recursive)
type(parse_node_t), intent(inout) :: node
type(parse_node_t), pointer :: current
logical, intent(in), optional :: recursive
logical :: rec
rec = .true.; if (present (recursive)) rec = recursive
call token_final (node%token)
if (rec) then
do while (associated (node%sub_first))
current => node%sub_first
node%sub_first => node%sub_first%next
call parse_node_final (current)
deallocate (current)
end do
end if
end subroutine parse_node_final
@ %def parse_node_final
@
\subsection{Filling nodes}
The constructors allocate and initialize the node. There are two
possible initializers (in a later version, should correspond to
different type extensions).
First, leaf (terminal) nodes. The token constructor does the actual
work, looking at the requested type and key for the given rule and
matching against the lexeme contents. If it fails, the token will
keep the type [[S_UNKNOWN]] and remain empty. Otherwise, we have a
valid node which contains the new token.
If the lexeme argument is absent, the token is left empty.
<<Parser: procedures>>=
subroutine parse_node_create_leaf (node, rule, lexeme)
type(parse_node_t), pointer :: node
type(syntax_rule_t), intent(in), target :: rule
type(lexeme_t), intent(in) :: lexeme
allocate (node)
node%rule => rule
call token_init (node%token, lexeme, &
syntax_rule_get_type (rule), syntax_rule_get_key (rule))
if (.not. token_is_valid (node%token)) deallocate (node)
end subroutine parse_node_create_leaf
@ %def parse_node_create_leaf
@ This version allows us to manually create a leaf node that holds a keyword.
<<Parser: public>>=
public :: parse_node_create_key
<<Parser: procedures>>=
subroutine parse_node_create_key (node, rule)
type(parse_node_t), intent(out) :: node
type(syntax_rule_t), intent(in), target :: rule
node%rule => rule
call token_init_key (node%token, syntax_rule_get_key (rule))
end subroutine parse_node_create_key
@ %def parse_node_create_key
@ This version allows us to manually create a leaf node that holds a fixed
value. Only one of the optional values should be provided.
<<Parser: public>>=
public :: parse_node_create_value
<<Parser: procedures>>=
subroutine parse_node_create_value (node, rule, ival, rval, cval, sval, lval)
type(parse_node_t), intent(out) :: node
type(syntax_rule_t), intent(in), target :: rule
integer, intent(in), optional :: ival
real(default), intent(in), optional :: rval
complex(default), intent(in), optional :: cval
type(string_t), intent(in), optional :: sval
logical, intent(in), optional :: lval
node%rule => rule
call parse_node_set_value (node, ival, rval, cval, sval, lval)
end subroutine parse_node_create_value
@ %def parse_node_create_value
@ Directly set the value without changing anything else.
<<Parser: public>>=
public :: parse_node_set_value
<<Parser: procedures>>=
subroutine parse_node_set_value (node, ival, rval, cval, sval, lval)
type(parse_node_t), intent(inout) :: node
integer, intent(in), optional :: ival
real(default), intent(in), optional :: rval
complex(default), intent(in), optional :: cval
type(string_t), intent(in), optional :: sval
logical, intent(in), optional :: lval
if (present (ival)) then
node%token = ival
else if (present (rval)) then
node%token = rval
else if (present (cval)) then
node%token = cval
else if (present (lval)) then
node%token = lval
else if (present (sval)) then
node%token = sval
end if
end subroutine parse_node_set_value
@ %def parse_node_set_value
@ Second, branch nodes. We first assign the rule:
<<Parser: public>>=
public :: parse_node_create_branch
<<Parser: procedures>>=
subroutine parse_node_create_branch (node, rule)
type(parse_node_t), pointer :: node
type(syntax_rule_t), intent(in), target :: rule
allocate (node)
node%rule => rule
end subroutine parse_node_create_branch
@ %def parse_node_create_branch
@ Copy a node. This is a shallow copy. Note that we have to nullify the
[[next]] pointer if we don't want to inherit the context of the original node.
<<Parser: parse node: TBP>>=
procedure :: copy => parse_node_copy
<<Parser: procedures>>=
subroutine parse_node_copy (node, copy)
class(parse_node_t), intent(in) :: node
type(parse_node_t), pointer, intent(out) :: copy
allocate (copy)
select type (node)
type is (parse_node_t)
copy = node
end select
copy%next => null ()
end subroutine parse_node_copy
@ %def parse_node_copy
@ Append a sub-node. The sub-node must exist and be a valid target,
otherwise nothing is done.
<<Parser: public>>=
public :: parse_node_append_sub
<<Parser: parse node: TBP>>=
procedure :: append_sub => parse_node_append_sub
<<Parser: procedures>>=
subroutine parse_node_append_sub (node, sub)
class(parse_node_t), intent(inout) :: node
type(parse_node_t), pointer :: sub
if (associated (sub)) then
if (associated (node%sub_last)) then
node%sub_last%next => sub
else
node%sub_first => sub
end if
node%sub_last => sub
end if
end subroutine parse_node_append_sub
@ %def parse_node_append_sub
@ For easy access, once the list is complete we count the number of
sub-nodes. If there are no subnodes, the whole node is deleted.
<<Parser: public>>=
public :: parse_node_freeze_branch
<<Parser: procedures>>=
subroutine parse_node_freeze_branch (node)
type(parse_node_t), pointer :: node
type(parse_node_t), pointer :: current
node%n_sub = 0
current => node%sub_first
do while (associated (current))
node%n_sub = node%n_sub + 1
current => current%next
end do
if (node%n_sub == 0) deallocate (node)
end subroutine parse_node_freeze_branch
@ %def parse_node_freeze_branch
@ Replace the syntax rule. This makes sense only if the parse node
adheres to the syntax of the new rule.
<<Parser: public>>=
public :: parse_node_replace_rule
<<Parser: procedures>>=
subroutine parse_node_replace_rule (node, rule)
type(parse_node_t), pointer :: node
type(syntax_rule_t), intent(in), target :: rule
node%rule => rule
end subroutine parse_node_replace_rule
@ %def parse_node_replace_rule
@ Replace the last subnode by the target node. Since the subnodes are
stored as a linked list, we can do this only if we copy the subnodes.
Furthermore, the target node must also be copied, and the [[next]]
pointer of the copy is nullified. This ensures that we cannot modify
the originals at the subnode level.
All copies are shallow copies. This implies that further
modifications at the sub-subnode level will affect the original nodes
and must therefore be forbidden.
Use with care, this invites to memory mismanagement. The copy nodes
can be deallocated, but not finalized, since its subnodes are the same
objects as the subnodes of the target node.
<<Parser: public>>=
public :: parse_node_replace_last_sub
<<Parser: procedures>>=
subroutine parse_node_replace_last_sub (node, pn_target)
type(parse_node_t), intent(inout), target :: node
type(parse_node_t), intent(in), target :: pn_target
type(parse_node_t), pointer :: current, current_copy, previous
integer :: i
select case (node%n_sub)
case (1)
allocate (current_copy)
current_copy = pn_target
node%sub_first => current_copy
case (2:)
current => node%sub_first
allocate (current_copy)
current_copy = current
node%sub_first => current_copy
previous => current_copy
do i = 1, node%n_sub - 2
current => current%next
allocate (current_copy)
current_copy = current
previous%next => current_copy
previous => current_copy
end do
allocate (current_copy)
current_copy = pn_target
previous%next => current_copy
case default
call parse_node_write (node)
call msg_bug ("'replace_last_sub' called for non-branch parse node")
end select
current_copy%next => null ()
node%sub_last => current_copy
end subroutine parse_node_replace_last_sub
@ %def parse_node_replace_last_sub
@
\subsection{Accessing nodes}
Return the node contents. Check if pointers are associated. No
check when accessing a sub-node; assume that [[parse_node_n_sub]] is always
used for the upper bound.
The token extractor returns a pointer.
<<Parser: public>>=
public :: parse_node_get_rule_ptr
public :: parse_node_get_n_sub
public :: parse_node_get_sub_ptr
public :: parse_node_get_next_ptr
public :: parse_node_get_last_sub_ptr
<<Parser: parse node: TBP>>=
procedure :: get_rule_ptr => parse_node_get_rule_ptr
procedure :: get_n_sub => parse_node_get_n_sub
procedure :: get_sub_ptr => parse_node_get_sub_ptr
procedure :: get_next_ptr => parse_node_get_next_ptr
<<Parser: procedures>>=
function parse_node_get_rule_ptr (node) result (rule)
class(parse_node_t), intent(in) :: node
type(syntax_rule_t), pointer :: rule
if (associated (node%rule)) then
rule => node%rule
else
rule => null ()
call parse_node_undefined (node, "rule")
end if
end function parse_node_get_rule_ptr
function parse_node_get_n_sub (node) result (n)
class(parse_node_t), intent(in) :: node
integer :: n
n = node%n_sub
end function parse_node_get_n_sub
function parse_node_get_sub_ptr (node, n, tag, required) result (sub)
class(parse_node_t), intent(in), target :: node
type(parse_node_t), pointer :: sub
integer, intent(in), optional :: n
character(*), intent(in), optional :: tag
logical, intent(in), optional :: required
integer :: i
sub => node%sub_first
if (present (n)) then
do i = 2, n
if (associated (sub)) then
sub => sub%next
else
return
end if
end do
end if
call parse_node_check (sub, tag, required)
end function parse_node_get_sub_ptr
function parse_node_get_next_ptr (sub, n, tag, required) result (next)
class(parse_node_t), intent(in), target :: sub
type(parse_node_t), pointer :: next
integer, intent(in), optional :: n
character(*), intent(in), optional :: tag
logical, intent(in), optional :: required
integer :: i
next => sub%next
if (present (n)) then
do i = 2, n
if (associated (next)) then
next => next%next
else
exit
end if
end do
end if
call parse_node_check (next, tag, required)
end function parse_node_get_next_ptr
function parse_node_get_last_sub_ptr (node, tag, required) result (sub)
type(parse_node_t), pointer :: sub
type(parse_node_t), intent(in), target :: node
character(*), intent(in), optional :: tag
logical, intent(in), optional :: required
sub => node%sub_last
call parse_node_check (sub, tag, required)
end function parse_node_get_last_sub_ptr
@ %def parse_node_get_rule_ptr
@ %def parse_node_get_n_sub parse_node_get_sub_ptr
@ %def parse_node_get_next_ptr
@ %def parse_node_get_last_sub_ptr
<<Parser: procedures>>=
subroutine parse_node_undefined (node, obj)
type(parse_node_t), intent(in) :: node
character(*), intent(in) :: obj
call parse_node_write (node, 6)
call msg_bug (" Parse-tree node: " // obj // " requested, but undefined")
end subroutine parse_node_undefined
@ %def parse_node_undefined
@ Check if a parse node has a particular tag, and if it is associated:
<<Parser: public>>=
public :: parse_node_check
<<Parser: procedures>>=
subroutine parse_node_check (node, tag, required)
type(parse_node_t), pointer :: node
character(*), intent(in), optional :: tag
logical, intent(in), optional :: required
if (associated (node)) then
if (present (tag)) then
if (parse_node_get_rule_key (node) /= tag) &
call parse_node_mismatch (tag, node)
end if
else
if (present (required)) then
if (required) &
call msg_bug (" Missing node, expected <" // tag // ">")
end if
end if
end subroutine parse_node_check
@ %def parse_node_check
@ This is called by a parse-tree scanner if the expected and the
actual nodes do not match
<<Parser: public>>=
public :: parse_node_mismatch
<<Parser: procedures>>=
subroutine parse_node_mismatch (string, parse_node)
character(*), intent(in) :: string
type(parse_node_t), intent(in) :: parse_node
call parse_node_write (parse_node)
call msg_bug (" Syntax mismatch, expected <" // string // ">.")
end subroutine parse_node_mismatch
@ %def parse_node_mismatch
@ The following functions are wrappers for extracting the token contents.
<<Parser: public>>=
public :: parse_node_get_logical
public :: parse_node_get_integer
public :: parse_node_get_real
public :: parse_node_get_cmplx
public :: parse_node_get_string
public :: parse_node_get_key
public :: parse_node_get_rule_key
<<Parser: parse node: TBP>>=
procedure :: get_logical => parse_node_get_logical
procedure :: get_integer => parse_node_get_integer
procedure :: get_real => parse_node_get_real
procedure :: get_cmplx => parse_node_get_cmplx
procedure :: get_string => parse_node_get_string
procedure :: get_key => parse_node_get_key
procedure :: get_rule_key => parse_node_get_rule_key
<<Parser: procedures>>=
function parse_node_get_logical (node) result (lval)
class(parse_node_t), intent(in), target :: node
logical :: lval
lval = token_get_logical (parse_node_get_token_ptr (node))
end function parse_node_get_logical
function parse_node_get_integer (node) result (ival)
class(parse_node_t), intent(in), target :: node
integer :: ival
ival = token_get_integer (parse_node_get_token_ptr (node))
end function parse_node_get_integer
function parse_node_get_real (node) result (rval)
class(parse_node_t), intent(in), target :: node
real(default) :: rval
rval = token_get_real (parse_node_get_token_ptr (node))
end function parse_node_get_real
function parse_node_get_cmplx (node) result (cval)
class(parse_node_t), intent(in), target :: node
complex(default) :: cval
cval = token_get_cmplx (parse_node_get_token_ptr (node))
end function parse_node_get_cmplx
function parse_node_get_string (node) result (sval)
class(parse_node_t), intent(in), target :: node
type(string_t) :: sval
sval = token_get_string (parse_node_get_token_ptr (node))
end function parse_node_get_string
function parse_node_get_key (node) result (kval)
class(parse_node_t), intent(in), target :: node
type(string_t) :: kval
kval = token_get_key (parse_node_get_token_ptr (node))
end function parse_node_get_key
function parse_node_get_rule_key (node) result (kval)
class(parse_node_t), intent(in), target :: node
type(string_t) :: kval
kval = syntax_rule_get_key (parse_node_get_rule_ptr (node))
end function parse_node_get_rule_key
function parse_node_get_token_ptr (node) result (token)
type(token_t), pointer :: token
type(parse_node_t), intent(in), target :: node
if (token_is_valid (node%token)) then
token => node%token
else
call parse_node_undefined (node, "token")
end if
end function parse_node_get_token_ptr
@ %def parse_node_get_logical parse_node_get_integer parse_node_get_real
@ %def parse_node_get_string parse_node_get_key parse_node_get_rule_key
@ %def parse_node_get_token_ptr
@ Return a MD5 sum for a parse node. The method is to write the node
to a scratch file and to evaluate the MD5 sum of that file.
<<Parser: public>>=
public :: parse_node_get_md5sum
<<Parser: procedures>>=
function parse_node_get_md5sum (pn) result (md5sum_pn)
character(32) :: md5sum_pn
type(parse_node_t), intent(in) :: pn
integer :: u
u = free_unit ()
open (unit = u, status = "scratch", action = "readwrite")
call parse_node_write_rec (pn, unit=u)
rewind (u)
md5sum_pn = md5sum (u)
close (u)
end function parse_node_get_md5sum
@ %def parse_node_get_md5sum
@
\subsection{The parse tree}
The parse tree is a tree of nodes, where leaf nodes hold a valid
token, while branch nodes point to a list of sub-nodes.
<<Parser: public>>=
public :: parse_tree_t
<<Parser: types>>=
type :: parse_tree_t
private
type(parse_node_t), pointer :: root_node => null ()
contains
<<Parser: parse tree: TBP>>
end type parse_tree_t
@ %def parse_tree_t
@ The parser. Its arguments are the parse tree (which should be empty
initially), the lexer (which should be already set up), the syntax
table (which should be valid), and the input stream. The input stream
is completely parsed, using the lexer setup and the syntax rules as
given, and the parse tree is built accordingly.
If [[check_eof]] is absent or true, the parser will complain about
trailing garbage. Otherwise, it will ignore it.
By default, the input stream is matched against the top rule in the
specified syntax. If [[key]] is given, it is matched against the rule
with the specified key instead.
Failure at the top level means that no rule could match at all; in
this case the error message will show the top rule.
<<Parser: public>>=
public :: parse_tree_init
<<Parser: parse tree: TBP>>=
procedure :: parse => parse_tree_init
<<Parser: procedures>>=
subroutine parse_tree_init &
(parse_tree, syntax, lexer, key, check_eof)
class(parse_tree_t), intent(inout) :: parse_tree
type(lexer_t), intent(inout) :: lexer
type(syntax_t), intent(in), target :: syntax
type(string_t), intent(in), optional :: key
logical, intent(in), optional :: check_eof
type(syntax_rule_t), pointer :: rule
type(lexeme_t) :: lexeme
type(parse_node_t), pointer :: node
logical :: ok, check
check = .true.; if (present (check_eof)) check = check_eof
call lexer_clear (lexer)
if (present (key)) then
rule => syntax_get_rule_ptr (syntax, key)
else
rule => syntax_get_top_rule_ptr (syntax)
end if
if (associated (rule)) then
call parse_node_match_rule (node, rule, ok)
if (ok) then
parse_tree%root_node => node
else
call parse_error (rule, lexeme)
end if
if (check) then
call lex (lexeme, lexer)
if (.not. lexeme_is_eof (lexeme)) then
call lexer_show_location (lexer)
call msg_fatal (" Syntax error " &
// "(at or before the location indicated above)")
end if
end if
else
call msg_bug (" Parser failed because syntax is empty")
end if
contains
<<Parser: internal subroutines of [[parse_tree_init]]>>
end subroutine parse_tree_init
@ %def parse
@ The parser works recursively, following the rule tree, building the
tree of nodes on the fly. If the given rule matches, the node is
filled on return. If not, the node remains empty.
<<Parser: internal subroutines of [[parse_tree_init]]>>=
recursive subroutine parse_node_match_rule (node, rule, ok)
type(parse_node_t), pointer :: node
type(syntax_rule_t), intent(in), target :: rule
logical, intent(out) :: ok
logical, parameter :: debug = .false.
integer :: type
if (debug) write (6, "(A)", advance="no") "Parsing rule: "
if (debug) call syntax_rule_write (rule, 6)
node => null ()
type = syntax_rule_get_type (rule)
if (syntax_rule_is_atomic (rule)) then
call lex (lexeme, lexer)
if (debug) write (6, "(A)", advance="no") "Token: "
if (debug) call lexeme_write (lexeme, 6)
call parse_node_create_leaf (node, rule, lexeme)
ok = associated (node)
if (.not. ok) call lexer_put_back (lexer, lexeme)
else
select case (type)
case (S_ALTERNATIVE); call parse_alternative (node, rule, ok)
case (S_GROUP); call parse_group (node, rule, ok)
case (S_SEQUENCE); call parse_sequence (node, rule, .false., ok)
case (S_LIST); call parse_sequence (node, rule, .true., ok)
case (S_ARGS); call parse_args (node, rule, ok)
case (S_IGNORE); call parse_ignore (node, ok)
end select
end if
if (debug) then
if (ok) then
write (6, "(A)", advance="no") "Matched rule: "
else
write (6, "(A)", advance="no") "Failed rule: "
end if
call syntax_rule_write (rule)
if (associated (node)) call parse_node_write (node)
end if
end subroutine parse_node_match_rule
@ %def parse_node_match_rule
@ Parse an alternative: try each case. If the match succeeds, the
node has been filled, so return. If nothing works, return failure.
<<Parser: internal subroutines of [[parse_tree_init]]>>=
recursive subroutine parse_alternative (node, rule, ok)
type(parse_node_t), pointer :: node
type(syntax_rule_t), intent(in), target :: rule
logical, intent(out) :: ok
integer :: i
do i = 1, syntax_rule_get_n_sub (rule)
call parse_node_match_rule (node, syntax_rule_get_sub_ptr (rule, i), ok)
if (ok) return
end do
ok = .false.
end subroutine parse_alternative
@ %def parse_alternative
@ Parse a group: the first and third lexemes have to be the
delimiters, the second one is parsed as the actual node, using now the
child rule. If the first match fails, return with failure. If the
other matches fail, issue an error, since we cannot lex back more than
one item.
<<Parser: internal subroutines of [[parse_tree_init]]>>=
recursive subroutine parse_group (node, rule, ok)
type(parse_node_t), pointer :: node
type(syntax_rule_t), intent(in), target :: rule
logical, intent(out) :: ok
type(string_t), dimension(2) :: delimiter
delimiter = syntax_rule_get_delimiter (rule)
call lex (lexeme, lexer)
if (lexeme_get_string (lexeme) == delimiter(1)) then
call parse_node_match_rule (node, syntax_rule_get_sub_ptr (rule, 1), ok)
if (ok) then
call lex (lexeme, lexer)
if (lexeme_get_string (lexeme) == delimiter(2)) then
ok = .true.
else
call parse_error (rule, lexeme)
end if
else
call parse_error (rule, lexeme)
end if
else
call lexer_put_back (lexer, lexeme)
ok = .false.
end if
end subroutine parse_group
@ %def parse_group
@ Parsing a sequence. The last rule element may be special: optional
and/or repetitive. Each sub-node that matches is appended to the
sub-node list of the parent node.
If [[sep]] is true, we look for a separator after each element.
<<Parser: internal subroutines of [[parse_tree_init]]>>=
recursive subroutine parse_sequence (node, rule, sep, ok)
type(parse_node_t), pointer :: node
type(syntax_rule_t), intent(in), target :: rule
logical, intent(in) :: sep
logical, intent(out) :: ok
type(parse_node_t), pointer :: current
integer :: i, n
logical :: opt, rep, cont
type(string_t) :: separator
call parse_node_create_branch (node, rule)
if (sep) separator = syntax_rule_get_separator (rule)
n = syntax_rule_get_n_sub (rule)
opt = syntax_rule_last_optional (rule)
rep = syntax_rule_last_repetitive (rule)
ok = .true.
cont = .true.
SCAN_RULE: do i = 1, n
call parse_node_match_rule &
(current, syntax_rule_get_sub_ptr (rule, i), cont)
if (cont) then
call parse_node_append_sub (node, current)
if (sep .and. (i<n .or. rep)) then
call lex (lexeme, lexer)
if (lexeme_get_string (lexeme) /= separator) then
call lexer_put_back (lexer, lexeme)
cont = .false.
exit SCAN_RULE
end if
end if
else
if (i == n .and. opt) then
exit SCAN_RULE
else if (i == 1) then
ok = .false.
exit SCAN_RULE
else
call parse_error (rule, lexeme)
end if
end if
end do SCAN_RULE
if (rep) then
do while (cont)
call parse_node_match_rule &
(current, syntax_rule_get_sub_ptr (rule, n), cont)
if (cont) then
call parse_node_append_sub (node, current)
if (sep) then
call lex (lexeme, lexer)
if (lexeme_get_string (lexeme) /= separator) then
call lexer_put_back (lexer, lexeme)
cont = .false.
end if
end if
else
if (sep) call parse_error (rule, lexeme)
end if
end do
end if
call parse_node_freeze_branch (node)
end subroutine parse_sequence
@ %def parse_sequence
@ Argument list: We use the [[parse_group]] code, but call
[[parse_sequence]] inside.
<<Parser: internal subroutines of [[parse_tree_init]]>>=
recursive subroutine parse_args (node, rule, ok)
type(parse_node_t), pointer :: node
type(syntax_rule_t), intent(in), target :: rule
logical, intent(out) :: ok
type(string_t), dimension(2) :: delimiter
delimiter = syntax_rule_get_delimiter (rule)
call lex (lexeme, lexer)
if (lexeme_get_string (lexeme) == delimiter(1)) then
call parse_sequence (node, rule, .true., ok)
if (ok) then
call lex (lexeme, lexer)
if (lexeme_get_string (lexeme) == delimiter(2)) then
ok = .true.
else
call parse_error (rule, lexeme)
end if
else
call parse_error (rule, lexeme)
end if
else
call lexer_put_back (lexer, lexeme)
ok = .false.
end if
end subroutine parse_args
@ %def parse_args
@ The IGNORE syntax reads one lexeme and discards it if it is numeric,
logical or string/identifier (but not a keyword). This is a
successful match. Otherwise, the match fails. The node
pointer is returned disassociated in any case.
<<Parser: internal subroutines of [[parse_tree_init]]>>=
subroutine parse_ignore (node, ok)
type(parse_node_t), pointer :: node
logical, intent(out) :: ok
call lex (lexeme, lexer)
select case (lexeme_get_type (lexeme))
case (T_NUMERIC, T_IDENTIFIER, T_QUOTED)
ok = .true.
case default
ok = .false.
end select
node => null ()
end subroutine parse_ignore
@ %def parse_ignore
@ If the match fails and we cannot step back:
<<Parser: internal subroutines of [[parse_tree_init]]>>=
subroutine parse_error (rule, lexeme)
type(syntax_rule_t), intent(in) :: rule
type(lexeme_t), intent(in) :: lexeme
character(80) :: buffer
integer :: u, iostat
call lexer_show_location (lexer)
u = free_unit ()
open (u, status = "scratch")
write (u, "(A)", advance="no") "Expected syntax:"
call syntax_rule_write (rule, u)
write (u, "(A)", advance="no") "Found token:"
call lexeme_write (lexeme, u)
rewind (u)
do
read (u, "(A)", iostat=iostat) buffer
if (iostat /= 0) exit
call msg_message (trim (buffer))
end do
call msg_fatal (" Syntax error " &
// "(at or before the location indicated above)")
end subroutine parse_error
@ %def parse_error
@ The finalizer recursively deallocates all nodes and their contents.
For each node, [[parse_node_final]] is called on the sub-nodes, which in
turn deallocates the token or sub-node array contained within each of
them. At the end, only the top node remains to be deallocated.
<<Parser: public>>=
public :: parse_tree_final
<<Parser: parse tree: TBP>>=
procedure :: final => parse_tree_final
<<Parser: procedures>>=
subroutine parse_tree_final (parse_tree)
class(parse_tree_t), intent(inout) :: parse_tree
if (associated (parse_tree%root_node)) then
call parse_node_final (parse_tree%root_node)
deallocate (parse_tree%root_node)
end if
end subroutine parse_tree_final
@ %def parse_tree_final
@ Print the parse tree. Print one token per line, indented according
to the depth of the node.
The [[verbose]] version includes type identifiers for the nodes.
<<Parser: public>>=
public :: parse_tree_write
<<Parser: parse tree: TBP>>=
procedure :: write => parse_tree_write
<<Parser: procedures>>=
subroutine parse_tree_write (parse_tree, unit, verbose)
class(parse_tree_t), intent(in) :: parse_tree
integer, intent(in), optional :: unit
logical, intent(in), optional :: verbose
integer :: u
logical :: short
u = given_output_unit (unit); if (u < 0) return
short = .true.; if (present (verbose)) short = .not. verbose
write (u, "(A)") "Parse tree:"
if (associated (parse_tree%root_node)) then
call parse_node_write_rec (parse_tree%root_node, unit, short, 1)
else
write (u, *) "[empty]"
end if
end subroutine parse_tree_write
@ %def parse_tree_write
@ This is a generic error that can be issued if the parse tree does
not meet the expectaions of the parser. This most certainly indicates
a bug.
<<Parser: public>>=
public :: parse_tree_bug
<<Parser: procedures>>=
subroutine parse_tree_bug (node, keys)
type(parse_node_t), intent(in) :: node
character(*), intent(in) :: keys
call parse_node_write (node)
call msg_bug (" Inconsistency in parse tree: expected " // keys)
end subroutine parse_tree_bug
@ %def parse_tree_bug
@
\subsection{Access the parser}
For scanning the parse tree we give access to the top node, as a node
pointer. Of course, there should be no write access.
<<Parser: parse tree: TBP>>=
procedure :: get_root_ptr => parse_tree_get_root_ptr
<<Parser: procedures>>=
function parse_tree_get_root_ptr (parse_tree) result (node)
class(parse_tree_t), intent(in) :: parse_tree
type(parse_node_t), pointer :: node
node => parse_tree%root_node
end function parse_tree_get_root_ptr
@ %def parse_tree_get_root_ptr
@
\subsection{Tools}
This operation traverses the parse tree and simplifies any occurences of a
set of syntax rules. If such a parse node has only one sub-node, it is
replaced by that subnode. (This makes sense only of the rules to eliminate
have no meaningful token.)
<<Parser: public>>=
public :: parse_tree_reduce
<<Parser: procedures>>=
subroutine parse_tree_reduce (parse_tree, rule_key)
type(parse_tree_t), intent(inout) :: parse_tree
type(string_t), dimension(:), intent(in) :: rule_key
type(parse_node_t), pointer :: pn
pn => parse_tree%root_node
if (associated (pn)) then
call parse_node_reduce (pn, null(), null())
end if
contains
recursive subroutine parse_node_reduce (pn, pn_prev, pn_parent)
type(parse_node_t), intent(inout), pointer :: pn
type(parse_node_t), intent(in), pointer :: pn_prev, pn_parent
type(parse_node_t), pointer :: pn_sub, pn_sub_prev, pn_tmp
pn_sub_prev => null ()
pn_sub => pn%sub_first
do while (associated (pn_sub))
call parse_node_reduce (pn_sub, pn_sub_prev, pn)
pn_sub_prev => pn_sub
pn_sub => pn_sub%next
end do
if (parse_node_get_n_sub (pn) == 1) then
if (matches (parse_node_get_rule_key (pn), rule_key)) then
pn_tmp => pn
pn => pn%sub_first
if (associated (pn_prev)) then
pn_prev%next => pn
else if (associated (pn_parent)) then
pn_parent%sub_first => pn
else
parse_tree%root_node => pn
end if
if (associated (pn_tmp%next)) then
pn%next => pn_tmp%next
else if (associated (pn_parent)) then
pn_parent%sub_last => pn
end if
call parse_node_final (pn_tmp, recursive=.false.)
deallocate (pn_tmp)
end if
end if
end subroutine parse_node_reduce
function matches (key, key_list) result (flag)
logical :: flag
type(string_t), intent(in) :: key
type(string_t), dimension(:), intent(in) :: key_list
integer :: i
flag = .true.
do i = 1, size (key_list)
if (key == key_list(i)) return
end do
flag = .false.
end function matches
end subroutine parse_tree_reduce
@ %def parse_tree_reduce
@
\subsection{Applications}
For a file of the form
\begin{verbatim}
process foo, bar
<something>
process xyz
<something>
\end{verbatim}
get the \verb|<something>| entry node for the first matching process
tag. If no matching entry is found, the node pointer remains
unassociated.
<<Parser: public>>=
public :: parse_tree_get_process_ptr
<<Parser: procedures>>=
function parse_tree_get_process_ptr (parse_tree, process) result (node)
type(parse_node_t), pointer :: node
type(parse_tree_t), intent(in), target :: parse_tree
type(string_t), intent(in) :: process
type(parse_node_t), pointer :: node_root, node_process_def
type(parse_node_t), pointer :: node_process_phs, node_process_list
integer :: j
node_root => parse_tree%get_root_ptr ()
if (associated (node_root)) then
node_process_phs => parse_node_get_sub_ptr (node_root)
SCAN_FILE: do while (associated (node_process_phs))
node_process_def => parse_node_get_sub_ptr (node_process_phs)
node_process_list => parse_node_get_sub_ptr (node_process_def, 2)
do j = 1, parse_node_get_n_sub (node_process_list)
if (parse_node_get_string &
(parse_node_get_sub_ptr (node_process_list, j)) &
== process) then
node => parse_node_get_next_ptr (node_process_def)
return
end if
end do
node_process_phs => parse_node_get_next_ptr (node_process_phs)
end do SCAN_FILE
node => null ()
else
node => null ()
end if
end function parse_tree_get_process_ptr
@ %def parse_tree_get_process_ptr
@
\subsection{Unit tests}
Test module, followed by the corresponding implementation module.
<<[[parser_ut.f90]]>>=
<<File header>>
module parser_ut
use unit_tests
use parser_uti
<<Standard module head>>
<<Parser: public test>>
contains
<<Parser: test driver>>
end module parser_ut
@ %def parser_ut
@
<<[[parser_uti.f90]]>>=
<<File header>>
module parser_uti
use syntax_rules
use parser
<<Standard module head>>
<<Parser: test declarations>>
contains
<<Parser: tests>>
end module parser_uti
@ %def parser_ut
@ API: driver for the unit tests below.
<<Parser: public test>>=
public :: parse_test
<<Parser: test driver>>=
subroutine parse_test (u, results)
integer, intent(in) :: u
type(test_results_t), intent(inout) :: results
<<Parser: execute tests>>
end subroutine parse_test
@ %def parse_test
@ This checks the parser.
<<Parser: execute tests>>=
call test (parse_1, "parse_1", &
"check the parser", &
u, results)
<<Parser: test declarations>>=
public :: parse_1
<<Parser: tests>>=
subroutine parse_1 (u)
use ifiles
use lexers
integer, intent(in) :: u
type(ifile_t) :: ifile
type(syntax_t), target :: syntax
type(lexer_t) :: lexer
type(stream_t), target :: stream
type(parse_tree_t), target :: parse_tree
write (u, "(A)") "* Test output: Parsing"
write (u, "(A)") "* Purpose: test parse routines"
write (u, "(A)")
call ifile_append (ifile, "SEQ expr = term addition*")
call ifile_append (ifile, "SEQ addition = plus_or_minus term")
call ifile_append (ifile, "SEQ term = factor multiplication*")
call ifile_append (ifile, "SEQ multiplication = times_or_over factor")
call ifile_append (ifile, "SEQ factor = atom exponentiation*")
call ifile_append (ifile, "SEQ exponentiation = '^' atom")
call ifile_append (ifile, "ALT atom = real | delimited_expr")
call ifile_append (ifile, "GRO delimited_expr = ( expr )")
call ifile_append (ifile, "ALT plus_or_minus = '+' | '-'")
call ifile_append (ifile, "ALT times_or_over = '*' | '/'")
call ifile_append (ifile, "KEY '+'")
call ifile_append (ifile, "KEY '-'")
call ifile_append (ifile, "KEY '*'")
call ifile_append (ifile, "KEY '/'")
call ifile_append (ifile, "KEY '^'")
call ifile_append (ifile, "REA real")
write (u, "(A)") "* File contents (syntax definition):"
call ifile_write (ifile, u)
write (u, "(A)") "EOF"
write (u, "(A)")
call syntax_init (syntax, ifile)
call ifile_final (ifile)
call syntax_write (syntax, u)
write (u, "(A)")
call lexer_init (lexer, &
comment_chars = "", &
quote_chars = "'", &
quote_match = "'", &
single_chars = "+-*/^()", &
special_class = [""] , &
keyword_list = syntax_get_keyword_list_ptr (syntax))
call lexer_write_setup (lexer, u)
write (u, "(A)")
call ifile_append (ifile, "(27+8^3-2/3)*(4+7)^2*99")
write (u, "(A)") "* File contents (input file):"
call ifile_write (ifile, u)
write (u, "(A)") "EOF"
print *
call stream_init (stream, ifile)
call lexer_assign_stream (lexer, stream)
call parse_tree_init (parse_tree, syntax, lexer)
call stream_final (stream)
call parse_tree_write (parse_tree, u, .true.)
print *
write (u, "(A)") "* Cleanup, everything should now be empty:"
write (u, "(A)")
call parse_tree_final (parse_tree)
call parse_tree_write (parse_tree, u, .true.)
write (u, "(A)")
call lexer_final (lexer)
call lexer_write_setup (lexer, u)
write (u, "(A)")
call ifile_final (ifile)
write (u, "(A)") "* File contents:"
call ifile_write (ifile, u)
write (u, "(A)") "EOF"
write (u, "(A)")
call syntax_final (syntax)
call syntax_write (syntax, u)
write (u, "(A)")
write (u, "(A)") "* Test output end: parser_1"
end subroutine parse_1
@ %def parse_1
@
\clearpage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{XML Parser}
The XML parser is actually independent from the previous modules of
lexer and parser. However, for a generic I/O interface we make use of
the [[stream_t]] facility.
We need the XML parser for reading and writing LHEF data files. Only
a subset of XML is actually relevant. The parser is of the ``pull''
type, i.e., the program steers the reading of XML data in a
context-sensitive manner.
<<[[xml.f90]]>>=
<<File header>>
module xml
<<Use strings>>
use io_units
use system_defs, only: BLANK, TAB
use diagnostics
use ifiles
use lexers
<<Standard module head>>
<<XML: public>>
<<XML: types>>
contains
<<XML: procedures>>
end module xml
@ %def xml
@
\subsection{Cached Stream}
The stream type as defined in the [[lexer]] module is versatile
regarding the choice of input channel, but it does not allow reading a
section more than once. Here, we define an extension where we can
return a string to the stream, which is stored in a cache variable,
and presented to the caller for the next read.
<<XML: public>>=
public :: cstream_t
<<XML: types>>=
type, extends (stream_t) :: cstream_t
logical :: cache_is_empty = .true.
type(string_t) :: cache
contains
<<XML: cstream: TBP>>
end type cstream_t
@ %def cached_stream
@ The initializers are simply inherited.
Finalizer: also inherited, in essence:
<<XML: cstream: TBP>>=
procedure :: final => cstream_final
<<XML: procedures>>=
subroutine cstream_final (stream)
class(cstream_t), intent(inout) :: stream
stream%cache_is_empty = .true.
call stream%stream_t%final ()
end subroutine cstream_final
@ %def cstream_final
@ Get record: now, if there is a cache string, return this instead of
the record from the stream.
<<XML: cstream: TBP>>=
procedure :: get_record => cstream_get_record
<<XML: procedures>>=
subroutine cstream_get_record (cstream, string, iostat)
class(cstream_t), intent(inout) :: cstream
type(string_t), intent(out) :: string
integer, intent(out) :: iostat
if (cstream%cache_is_empty) then
call stream_get_record (cstream%stream_t, string, iostat)
else
string = cstream%cache
cstream%cache_is_empty = .true.
iostat = 0
end if
end subroutine cstream_get_record
@ %def cstream_get_record
@ Revert: return the (partially read) record to the stream, putting it
in the cache.
<<XML: cstream: TBP>>=
procedure :: revert_record => cstream_revert_record
<<XML: procedures>>=
subroutine cstream_revert_record (cstream, string)
class(cstream_t), intent(inout) :: cstream
type(string_t), intent(in) :: string
if (cstream%cache_is_empty) then
cstream%cache = string
cstream%cache_is_empty = .false.
else
call msg_bug ("CStream: attempt to revert twice")
end if
end subroutine cstream_revert_record
@ %def cstream_revert_record
@
\subsection{Attributes}
A tag attribute has a name and a value; both are strings. When the
attribute is defined, the [[known]] flag indicates this.
<<XML: types>>=
type :: attribute_t
type(string_t) :: name
type(string_t) :: value
logical :: known = .false.
contains
<<XML: attribute: TBP>>
end type attribute_t
@ %def attribute_t
@ Output in standard format, non-advancing. (If the value is unknown,
we indicate this by a question mark, which is non-standard.)
<<XML: attribute: TBP>>=
procedure :: write => attribute_write
<<XML: procedures>>=
subroutine attribute_write (object, unit)
class(attribute_t), intent(in) :: object
integer, intent(in), optional :: unit
integer :: u
u = given_output_unit (unit)
write (u, "(A,'=')", advance = "no") char (object%name)
if (object%known) then
write (u, "(A,A,A)", advance = "no") '"', char (object%value), '"'
else
write (u, "('?')", advance = "no")
end if
end subroutine attribute_write
@ %def attribute_write
@ This is a genuine constructor. The value is optional.
<<XML: public>>=
public :: xml_attribute
<<XML: procedures>>=
function xml_attribute (name, value) result (attribute)
type(string_t), intent(in) :: name
type(string_t), intent(in), optional :: value
type(attribute_t) :: attribute
attribute%name = name
if (present (value)) then
attribute%value = value
attribute%known = .true.
else
attribute%known = .false.
end if
end function xml_attribute
@ %def xml_attribute
@ Set a value explicitly.
<<XML: attribute: TBP>>=
procedure :: set_value => attribute_set_value
<<XML: procedures>>=
subroutine attribute_set_value (attribute, value)
class(attribute_t), intent(inout) :: attribute
type(string_t), intent(in) :: value
attribute%value = value
attribute%known = .true.
end subroutine attribute_set_value
@ %def attribute_set_value
@ Extract a value. If unset, return [["?"]]
<<XML: attribute: TBP>>=
procedure :: get_value => attribute_get_value
<<XML: procedures>>=
function attribute_get_value (attribute) result (value)
class(attribute_t), intent(in) :: attribute
type(string_t) :: value
if (attribute%known) then
value = attribute%value
else
value = "?"
end if
end function attribute_get_value
@ %def attribute_get_value
@
\subsection{The Tag Type}
The basic entity is the internal representation of an XML tag. The
tag has a name, a well-defined set of attributes which may be
mandatory or optional, and a flag that determines whether there is
content or not. The content itself is not stored in the data structure.
<<XML: public>>=
public :: xml_tag_t
<<XML: types>>=
type :: xml_tag_t
type(string_t) :: name
type(attribute_t), dimension(:), allocatable :: attribute
logical :: has_content = .false.
contains
<<XML: tag: TBP>>
end type xml_tag_t
@ %def xml_tag_t
@ Initialization. There are different versions, depending on content.
<<XML: tag: TBP>>=
generic :: init => init_no_attributes
procedure :: init_no_attributes => xml_tag_init_no_attributes
<<XML: procedures>>=
subroutine xml_tag_init_no_attributes (tag, name, has_content)
class(xml_tag_t), intent(out) :: tag
type(string_t), intent(in) :: name
logical, intent(in), optional :: has_content
tag%name = name
allocate (tag%attribute (0))
if (present (has_content)) tag%has_content = has_content
end subroutine xml_tag_init_no_attributes
@ %def xml_tag_init_no_attributes
@ This version sets attributes.
<<XML: tag: TBP>>=
generic :: init => init_with_attributes
procedure :: init_with_attributes => xml_tag_init_with_attributes
<<XML: procedures>>=
subroutine xml_tag_init_with_attributes (tag, name, attribute, has_content)
class(xml_tag_t), intent(out) :: tag
type(string_t), intent(in) :: name
type(attribute_t), dimension(:), intent(in) :: attribute
logical, intent(in), optional :: has_content
tag%name = name
allocate (tag%attribute (size (attribute)))
tag%attribute = attribute
if (present (has_content)) tag%has_content = has_content
end subroutine xml_tag_init_with_attributes
@ %def xml_tag_init_with_attributes
@ Set an attribute value explicitly.
<<XML: tag: TBP>>=
procedure :: set_attribute => xml_tag_set_attribute
<<XML: procedures>>=
subroutine xml_tag_set_attribute (tag, i, value)
class(xml_tag_t), intent(inout) :: tag
integer, intent(in) :: i
type(string_t), intent(in) :: value
call tag%attribute(i)%set_value (value)
end subroutine xml_tag_set_attribute
@ %def xml_tag_set_attribute
@ Get an attribute value.
<<XML: tag: TBP>>=
procedure :: get_attribute => xml_tag_get_attribute
<<XML: procedures>>=
function xml_tag_get_attribute (tag, i) result (value)
class(xml_tag_t), intent(in) :: tag
integer, intent(in) :: i
type(string_t) :: value
value = tag%attribute(i)%get_value ()
end function xml_tag_get_attribute
@ %def xml_tag_set_attribute
@ Output to an I/O unit, default STDOUT. We use non-advancing output.
<<XML: tag: TBP>>=
generic :: write => write_without_content
procedure :: write_without_content => xml_tag_write
<<XML: procedures>>=
subroutine xml_tag_write (tag, unit)
class(xml_tag_t), intent(in) :: tag
integer, intent(in), optional :: unit
integer :: u, i
u = given_output_unit (unit)
write (u, "('<',A)", advance = "no") char (tag%name)
do i = 1, size (tag%attribute)
write (u, "(1x)", advance = "no")
call tag%attribute(i)%write (u)
end do
if (tag%has_content) then
write (u, "('>')", advance = "no")
else
write (u, "(' />')", advance = "no")
end if
end subroutine xml_tag_write
@ %def xml_tag_write
@ If there is content, we should write the context next (arbitrary
format), the write the corresponding closing tag. Again, non-advancing.
<<XML: tag: TBP>>=
procedure :: close => xml_tag_close
<<XML: procedures>>=
subroutine xml_tag_close (tag, unit)
class(xml_tag_t), intent(in) :: tag
integer, intent(in), optional :: unit
integer :: u
u = given_output_unit (unit)
write (u, "('</',A,'>')", advance = "no") char (tag%name)
end subroutine xml_tag_close
@ %def xml_tag_close
@ Given content as a single string, we can write tag, content, and
closing at once
<<XML: tag: TBP>>=
generic :: write => write_with_content
procedure :: write_with_content => xml_tag_write_with_content
<<XML: procedures>>=
subroutine xml_tag_write_with_content (tag, content, unit)
class(xml_tag_t), intent(in) :: tag
type(string_t), intent(in) :: content
integer, intent(in), optional :: unit
integer :: u
u = given_output_unit (unit)
call tag%write (u)
write (u, "(A)", advance = "no") char (content)
call tag%close (u)
end subroutine xml_tag_write_with_content
@ %def xml_tag_write_with_content
@ Input from stream. We know what we are looking for, so we check if
the name matches, then fill attributes. We report an error if (a) an
I/O error occurs, (b) we reach EOF before encountering the tag, (c)
if the tag is incomplete.
Trailing text after reading a tag is put back to the input stream.
We assume that the tag is not broken across records, and that there is
only one tag within the record. This is more restricted than standard
XML.
<<XML: tag: TBP>>=
procedure :: read => xml_tag_read
<<XML: procedures>>=
subroutine xml_tag_read (tag, cstream, success)
class(xml_tag_t), intent(inout) :: tag
type(cstream_t), intent(inout) :: cstream
logical, intent(out) :: success
type(string_t) :: string
integer :: iostat, p1, p2
character(2), parameter :: WS = BLANK // TAB
logical :: done
! Skip comments and blank lines
FIND_NON_COMMENT: do
FIND_NONEMPTY_RECORD: do
call cstream%get_record (string, iostat)
if (iostat /= 0) call err_io ()
p1 = verify (string, WS)
if (p1 > 0) exit FIND_NONEMPTY_RECORD
end do FIND_NONEMPTY_RECORD
! Look for comment beginning
p2 = p1 + 3
if (extract (string, p1, p2) /= "<!--") exit FIND_NON_COMMENT
! Look for comment end, then restart
string = extract (string, p2 + 1)
FIND_COMMENT_END: do
do p1 = 1, len (string) - 2
p2 = p1 + 2
if (extract (string, p1, p2) == "-->") then
! Return trailing text to the stream
string = extract (string, p2 + 1)
if (string /= "") call cstream%revert_record (string)
exit FIND_COMMENT_END
end if
end do
call cstream%get_record (string, iostat)
if (iostat /= 0) call err_io ()
end do FIND_COMMENT_END
end do FIND_NON_COMMENT
! Look for opening <
p2 = p1
if (extract (string, p1, p2) /= "<") then
call cstream%revert_record (string)
success = .false.; return
else
! Look for tag name
string = extract (string, p2 + 1)
p1 = verify (string, WS); if (p1 == 0) call err_incomplete ()
p2 = p1 + len (tag%name) - 1
if (extract (string, p1, p2) /= tag%name) then
call cstream%revert_record ("<" // string)
success = .false.; return
else
! Look for attributes
string = extract (string, p2 + 1)
READ_ATTRIBUTES: do
call tag%read_attribute (string, done)
if (done) exit READ_ATTRIBUTES
end do READ_ATTRIBUTES
! Look for closing >
p1 = verify (string, WS); if (p1 == 0) call err_incomplete ()
p2 = p1
if (extract (string, p1, p1) == ">") then
tag%has_content = .true.
else
! Look for closing />
p2 = p1 + 1
if (extract (string, p1, p2) /= "/>") call err_incomplete ()
end if
! Return trailing text to the stream
string = extract (string, p2 + 1)
if (string /= "") call cstream%revert_record (string)
success = .true.
end if
end if
contains
subroutine err_io ()
select case (iostat)
case (:-1)
call msg_fatal ("XML: Error reading tag '" // char (tag%name) &
// "': end of file")
case (1:)
call msg_fatal ("XML: Error reading tag '" // char (tag%name) &
// "': I/O error")
end select
success = .false.
end subroutine err_io
subroutine err_incomplete ()
call msg_fatal ("XML: Error reading tag '" // char (tag%name) &
// "': tag incomplete")
success = .false.
end subroutine err_incomplete
end subroutine xml_tag_read
@ %def xml_tag_read
@ Read a single attribute. If the attribute is valid, assign the
value. Setting a value twice should be an error, but is not detected.
If the attribute is unknown, ignore it. If we reach the closing sign,
report this.
<<XML: tag: TBP>>=
procedure :: read_attribute => xml_tag_read_attribute
<<XML: procedures>>=
subroutine xml_tag_read_attribute (tag, string, done)
class(xml_tag_t), intent(inout) :: tag
type(string_t), intent(inout) :: string
logical, intent(out) :: done
character(2), parameter :: WS = BLANK // TAB
type(string_t) :: name, value
integer :: p1, p2, i
p1 = verify (string, WS); if (p1 == 0) call err ()
p2 = p1
! Look for first terminating '>' or '/>'
if (extract (string, p1, p2) == ">") then
done = .true.
else
p2 = p1 + 1
if (extract (string, p1, p2) == "/>") then
done = .true.
else
! Look for '='
p2 = scan (string, '=')
if (p2 == 0) call err ()
name = trim (extract (string, p1, p2 - 1))
! Look for '"'
string = extract (string, p2 + 1)
p1 = verify (string, WS); if (p1 == 0) call err ()
p2 = p1
if (extract (string, p1, p2) /= '"') call err ()
! Look for matching '"' and get value
string = extract (string, p2 + 1)
p1 = 1
p2 = scan (string, '"')
if (p2 == 0) call err ()
value = extract (string, p1, p2 - 1)
SCAN_KNOWN_ATTRIBUTES: do i = 1, size (tag%attribute)
if (name == tag%attribute(i)%name) then
call tag%attribute(i)%set_value (value)
exit SCAN_KNOWN_ATTRIBUTES
end if
end do SCAN_KNOWN_ATTRIBUTES
string = extract (string, p2 + 1)
done = .false.
end if
end if
contains
subroutine err ()
call msg_fatal ("XML: Error reading attributes of '" // char (tag%name) &
// "': syntax error")
end subroutine err
end subroutine xml_tag_read_attribute
@ %def xml_tag_read_attribute
@ Read the content string of a tag. We check for the appropriate
closing tag and report it. If a closing tag does not match in name,
ignore it.
Note: this assumes that no tag with the same name is embedded in the
current content. Also, we do not check for XML validity inside the
content.
<<XML: tag: TBP>>=
procedure :: read_content => xml_tag_read_content
<<XML: procedures>>=
subroutine xml_tag_read_content (tag, cstream, content, closing)
class(xml_tag_t), intent(in) :: tag
type(cstream_t), intent(inout) :: cstream
type(string_t), intent(out) :: content
type(string_t) :: string
logical, intent(out) :: closing
integer :: iostat
integer :: p0, p1, p2
character(2), parameter :: WS = BLANK // TAB
call cstream%get_record (content, iostat)
if (iostat /= 0) call err_io ()
closing = .false.
FIND_CLOSING: do p0 = 1, len (content) - 1
! Look for terminating </
p1 = p0
p2 = p1 + 1
if (extract (content, p1, p2) == "</") then
! Look for closing tag name
string = extract (content, p2 + 1)
p1 = verify (string, WS); if (p1 == 0) call err_incomplete ()
p2 = p1 + len (tag%name) - 1
if (extract (string, p1, p2) == tag%name) then
! Tag name matches: look for final >
string = extract (string, p2 + 1)
p1 = verify (string, WS); if (p1 == 0) call err_incomplete ()
p2 = p1
if (extract (string, p1, p2) /= ">") call err_incomplete ()
! Return trailing text to the stream
string = extract (string, p2 + 1)
if (string /= "") call cstream%revert_record (string)
content = extract (content, 1, p0 -1)
closing = .true.
exit FIND_CLOSING
end if
end if
end do FIND_CLOSING
contains
subroutine err_io ()
select case (iostat)
case (:-1)
call msg_fatal ("XML: Error reading content of '" // char (tag%name) &
// "': end of file")
case (1:)
call msg_fatal ("XML: Error reading content of '" // char (tag%name) &
// "': I/O error")
end select
closing = .false.
end subroutine err_io
subroutine err_incomplete ()
call msg_fatal ("XML: Error reading content '" // char (tag%name) &
// "': closing tag incomplete")
closing = .false.
end subroutine err_incomplete
end subroutine xml_tag_read_content
@ %def xml_tag_read_content
@
\subsection{Unit tests}
Test module, followed by the corresponding implementation module.
<<[[xml_ut.f90]]>>=
<<File header>>
module xml_ut
use unit_tests
use xml_uti
<<Standard module head>>
<<XML: public test>>
contains
<<XML: test driver>>
end module xml_ut
@ %def xml_ut
@
<<[[xml_uti.f90]]>>=
<<File header>>
module xml_uti
<<Use strings>>
use io_units
use xml
<<Standard module head>>
<<XML: test declarations>>
contains
<<XML: tests>>
<<XML: test auxiliary>>
end module xml_uti
@ %def xml_ut
@ API: driver for the unit tests below.
<<XML: public test>>=
public :: xml_test
<<XML: test driver>>=
subroutine xml_test (u, results)
integer, intent(in) :: u
type(test_results_t), intent(inout) :: results
<<XML: execute tests>>
end subroutine xml_test
@ %def xml_test
@
\subsection{Auxiliary Routines}
Show the contents of a temporary file, i.e., open unit.
<<XML: test auxiliary>>=
subroutine show (u_tmp, u)
integer, intent(in) :: u_tmp, u
character (80) :: buffer
integer :: iostat
write (u, "(A)") "File content:"
rewind (u_tmp)
do
read (u_tmp, "(A)", iostat = iostat) buffer
if (iostat /= 0) exit
write (u, "(A)") trim (buffer)
end do
rewind (u_tmp)
end subroutine show
@ %def show
@
\subsection{Basic Tag I/O}
Write a tag and read it again, using a temporary file.
<<XML: execute tests>>=
call test (xml_1, "xml_1", &
"basic I/O", &
u, results)
<<XML: test declarations>>=
public :: xml_1
<<XML: tests>>=
subroutine xml_1 (u)
integer, intent(in) :: u
type(xml_tag_t), allocatable :: tag
integer :: u_tmp
type(cstream_t) :: cstream
logical :: success
write (u, "(A)") "* Test output: xml_1"
write (u, "(A)") "* Purpose: write and read tag"
write (u, "(A)")
write (u, "(A)") "* Empty tag"
write (u, *)
u_tmp = free_unit ()
open (u_tmp, status = "scratch", action = "readwrite")
allocate (tag)
call tag%init (var_str ("tagname"))
call tag%write (u_tmp)
write (u_tmp, *)
deallocate (tag)
call show (u_tmp, u)
write (u, *)
write (u, "(A)") "Result from read:"
call cstream%init (u_tmp)
allocate (tag)
call tag%init (var_str ("tagname"))
call tag%read (cstream, success)
call tag%write (u)
write (u, *)
write (u, "(A,L1)") "success = ", success
deallocate (tag)
close (u_tmp)
call cstream%final ()
write (u, *)
write (u, "(A)") "* Tag with preceding blank lines"
write (u, *)
u_tmp = free_unit ()
open (u_tmp, status = "scratch", action = "readwrite")
allocate (tag)
call tag%init (var_str ("tagname"))
write (u_tmp, *)
write (u_tmp, "(A)") " "
write (u_tmp, "(2x)", advance = "no")
call tag%write (u_tmp)
write (u_tmp, *)
deallocate (tag)
call show (u_tmp, u)
write (u, *)
write (u, "(A)") "Result from read:"
call cstream%init (u_tmp)
allocate (tag)
call tag%init (var_str ("tagname"))
call tag%read (cstream, success)
call tag%write (u)
write (u, *)
write (u, "(A,L1)") "success = ", success
deallocate (tag)
close (u_tmp)
call cstream%final ()
write (u, *)
write (u, "(A)") "* Tag with preceding comments"
write (u, *)
u_tmp = free_unit ()
open (u_tmp, status = "scratch", action = "readwrite")
allocate (tag)
call tag%init (var_str ("tagname"))
write (u_tmp, "(A)") "<!-- comment -->"
write (u_tmp, *)
write (u_tmp, "(A)") "<!-- multiline"
write (u_tmp, "(A)") " comment -->"
call tag%write (u_tmp)
write (u_tmp, *)
deallocate (tag)
call show (u_tmp, u)
write (u, *)
write (u, "(A)") "Result from read:"
call cstream%init (u_tmp)
allocate (tag)
call tag%init (var_str ("tagname"))
call tag%read (cstream, success)
call tag%write (u)
write (u, *)
write (u, "(A,L1)") "success = ", success
close (u_tmp)
deallocate (tag)
call cstream%final ()
write (u, *)
write (u, "(A)") "* Tag with name mismatch"
write (u, *)
u_tmp = free_unit ()
open (u_tmp, status = "scratch", action = "readwrite")
allocate (tag)
call tag%init (var_str ("wrongname"))
call tag%write (u_tmp)
write (u_tmp, *)
deallocate (tag)
call show (u_tmp, u)
write (u, *)
write (u, "(A)") "Result from read:"
call cstream%init (u_tmp)
allocate (tag)
call tag%init (var_str ("tagname"))
call tag%read (cstream, success)
call tag%write (u)
write (u, *)
write (u, "(A,L1)") "success = ", success
deallocate (tag)
close (u_tmp)
call cstream%final ()
write (u, "(A)")
write (u, "(A)") "* Test output end: xml_1"
end subroutine xml_1
@ %def xml_1
@
\subsection{Optional Tag}
Write and read two tags, one of them optional.
<<XML: execute tests>>=
call test (xml_2, "xml_2", &
"optional tag", &
u, results)
<<XML: test declarations>>=
public :: xml_2
<<XML: tests>>=
subroutine xml_2 (u)
integer, intent(in) :: u
type(xml_tag_t), allocatable :: tag1, tag2
integer :: u_tmp
type(cstream_t) :: cstream
logical :: success
write (u, "(A)") "* Test output: xml_2"
write (u, "(A)") "* Purpose: handle optional tag"
write (u, "(A)")
write (u, "(A)") "* Optional tag present"
write (u, *)
u_tmp = free_unit ()
open (u_tmp, status = "scratch", action = "readwrite")
allocate (tag1)
call tag1%init (var_str ("option"))
call tag1%write (u_tmp)
write (u_tmp, *)
allocate (tag2)
call tag2%init (var_str ("tagname"))
call tag2%write (u_tmp)
write (u_tmp, *)
deallocate (tag1, tag2)
call show (u_tmp, u)
write (u, *)
write (u, "(A)") "Result from read:"
call cstream%init (u_tmp)
allocate (tag1)
call tag1%init (var_str ("option"))
call tag1%read (cstream, success)
call tag1%write (u)
write (u, *)
write (u, "(A,L1)") "success = ", success
write (u, *)
allocate (tag2)
call tag2%init (var_str ("tagname"))
call tag2%read (cstream, success)
call tag2%write (u)
write (u, *)
write (u, "(A,L1)") "success = ", success
deallocate (tag1, tag2)
close (u_tmp)
call cstream%final ()
write (u, *)
write (u, "(A)") "* Optional tag absent"
write (u, *)
u_tmp = free_unit ()
open (u_tmp, status = "scratch", action = "readwrite")
allocate (tag2)
call tag2%init (var_str ("tagname"))
call tag2%write (u_tmp)
write (u_tmp, *)
deallocate (tag2)
call show (u_tmp, u)
write (u, *)
write (u, "(A)") "Result from read:"
call cstream%init (u_tmp)
allocate (tag1)
call tag1%init (var_str ("option"))
call tag1%read (cstream, success)
call tag1%write (u)
write (u, *)
write (u, "(A,L1)") "success = ", success
write (u, *)
allocate (tag2)
call tag2%init (var_str ("tagname"))
call tag2%read (cstream, success)
call tag2%write (u)
write (u, *)
write (u, "(A,L1)") "success = ", success
deallocate (tag1, tag2)
close (u_tmp)
call cstream%final ()
write (u, "(A)")
write (u, "(A)") "* Test output end: xml_2"
end subroutine xml_2
@ %def xml_2
@
\subsection{Optional Tag}
Write and read a tag with single-line content.
<<XML: execute tests>>=
call test (xml_3, "xml_3", &
"content", &
u, results)
<<XML: test declarations>>=
public :: xml_3
<<XML: tests>>=
subroutine xml_3 (u)
integer, intent(in) :: u
type(xml_tag_t), allocatable :: tag
integer :: u_tmp
type(cstream_t) :: cstream
logical :: success, closing
type(string_t) :: content
write (u, "(A)") "* Test output: xml_3"
write (u, "(A)") "* Purpose: handle tag with content"
write (u, "(A)")
write (u, "(A)") "* Tag without content"
write (u, *)
u_tmp = free_unit ()
open (u_tmp, status = "scratch", action = "readwrite")
allocate (tag)
call tag%init (var_str ("tagname"))
call tag%write (u_tmp)
write (u_tmp, *)
deallocate (tag)
call show (u_tmp, u)
write (u, *)
write (u, "(A)") "Result from read:"
call cstream%init (u_tmp)
allocate (tag)
call tag%init (var_str ("tagname"))
call tag%read (cstream, success)
call tag%write (u)
write (u, *)
write (u, "(A,L1)") "success = ", success
write (u, "(A,L1)") "content = ", tag%has_content
write (u, *)
deallocate (tag)
close (u_tmp)
call cstream%final ()
write (u, "(A)") "* Tag with content"
write (u, *)
u_tmp = free_unit ()
open (u_tmp, status = "scratch", action = "readwrite")
allocate (tag)
call tag%init (var_str ("tagname"), has_content = .true.)
call tag%write (var_str ("Content text"), u_tmp)
write (u_tmp, *)
deallocate (tag)
call show (u_tmp, u)
write (u, *)
write (u, "(A)") "Result from read:"
call cstream%init (u_tmp)
allocate (tag)
call tag%init (var_str ("tagname"))
call tag%read (cstream, success)
call tag%read_content (cstream, content, closing)
call tag%write (u)
write (u, "(A)", advance = "no") char (content)
call tag%close (u)
write (u, *)
write (u, "(A,L1)") "success = ", success
write (u, "(A,L1)") "content = ", tag%has_content
write (u, "(A,L1)") "closing = ", closing
deallocate (tag)
close (u_tmp)
call cstream%final ()
write (u, *)
write (u, "(A)") "* Tag with multiline content"
write (u, *)
u_tmp = free_unit ()
open (u_tmp, status = "scratch", action = "readwrite")
allocate (tag)
call tag%init (var_str ("tagname"), has_content = .true.)
call tag%write (u_tmp)
write (u_tmp, *)
write (u_tmp, "(A)") "Line 1"
write (u_tmp, "(A)") "Line 2"
call tag%close (u_tmp)
write (u_tmp, *)
deallocate (tag)
call show (u_tmp, u)
write (u, *)
write (u, "(A)") "Result from read:"
call cstream%init (u_tmp)
allocate (tag)
call tag%init (var_str ("tagname"))
call tag%read (cstream, success)
call tag%write (u)
write (u, *)
do
call tag%read_content (cstream, content, closing)
if (closing) exit
write (u, "(A)") char (content)
end do
call tag%close (u)
write (u, *)
write (u, "(A,L1)") "success = ", success
write (u, "(A,L1)") "content = ", tag%has_content
deallocate (tag)
close (u_tmp)
call cstream%final ()
write (u, "(A)")
write (u, "(A)") "* Test output end: xml_3"
end subroutine xml_3
@ %def xml_3
@
\subsection{Basic Tag I/O}
Write a tag and read it again, using a temporary file.
<<XML: execute tests>>=
call test (xml_4, "xml_4", &
"attributes", &
u, results)
<<XML: test declarations>>=
public :: xml_4
<<XML: tests>>=
subroutine xml_4 (u)
integer, intent(in) :: u
type(xml_tag_t), allocatable :: tag
integer :: u_tmp
type(cstream_t) :: cstream
logical :: success
write (u, "(A)") "* Test output: xml_4"
write (u, "(A)") "* Purpose: handle tag with attributes"
write (u, "(A)")
write (u, "(A)") "* Tag with one mandatory and one optional attribute,"
write (u, "(A)") "* unknown attribute ignored"
write (u, *)
u_tmp = free_unit ()
open (u_tmp, status = "scratch", action = "readwrite")
allocate (tag)
call tag%init (var_str ("tagname"), &
[xml_attribute (var_str ("a1"), var_str ("foo")), &
xml_attribute (var_str ("a3"), var_str ("gee"))])
call tag%write (u_tmp)
deallocate (tag)
call show (u_tmp, u)
write (u, *)
write (u, "(A)") "Result from read:"
call cstream%init (u_tmp)
allocate (tag)
call tag%init (var_str ("tagname"), &
[xml_attribute (var_str ("a1")), &
xml_attribute (var_str ("a2"), var_str ("bar"))])
call tag%read (cstream, success)
call tag%write (u)
write (u, *)
deallocate (tag)
close (u_tmp)
call cstream%final ()
write (u, "(A)")
write (u, "(A)") "* Test output end: xml_4"
end subroutine xml_4
@ %def xml_4
File Metadata
Details
Attached
Mime Type
text/x-tex
Expires
Sat, May 3, 6:34 AM (1 d, 17 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
4983085
Default Alt Text
parsing.nw (174 KB)
Attached To
rWHIZARDSVN whizardsvn
Event Timeline
Log In to Comment