(* * This file implements operations on Determistic Finite Automata (DFAs). * * ---------------------------------------------------------------- * * @begin[license] * Copyright (C) 2002 Jason Hickey, Caltech * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Author: Jason Hickey * @email{jyh@cs.caltech.edu} * @end[license] *) open Format (************************************************************************ * Modules and type definitions. *) (* * The data structure that represents a set of integers. * We'll use small integers only, so the (-) function is * a valid comparison. *) module StateCompare = struct type t = int let compare = (-) end module StateSet = Set.Make (StateCompare) type state = int type state_set = StateSet.t (* * We use the ASCII characters as the symbols * in this lab. *) type symbol = char module SymbolCompare = struct type t = symbol let compare = Pervasives.compare end module SymbolSet = Set.Make (SymbolCompare) (* * For this lab, we'll use a simplified representation of * automata. The states of a DFA are finite sets of integers, * and the alphabet is always char. The reason why we do this * is mainly to avoid the module system. We want to be able * to build DFAs at run-time. * * In this simplified representation, a DFA is a 3-tuple: * dfa_delta : the transition function. * dfa_start : the start state * dfa_is_final : a final state predicate. *) type dfa = { dfa_delta : state_set -> symbol -> state_set; dfa_start : state_set; dfa_is_final : state_set -> bool } (* * We also use the simplified representation for NFAs. * * Formally, an NFA with epsilon-transitions is a 6-tuple: * state : the type of states. * symbol : the type of symbols. * delta_normal : the transition relation. To simplify the encoding, * the relation is expressed as a function that produces * a state set. * delta_epsilon : the epsilon-transitions, again expressed * as a state set. * start : the start state. * is_final : a final state predicate. * * In the simplified representation, we make the following assumptions: * 1. The states are integers in the range [0, nfa_states) * 2. The symbols are the "symbol" type (char) * 3. The start state is always state 0 *) type nfa = { nfa_states : state; nfa_delta_normal : state -> symbol -> state_set; nfa_delta_epsilon : state -> state_set; nfa_is_final : state -> bool } (* * The type of regular expressions. * * Empty == {} * Epsilon == { epsilon } * Concat (s, t) == st * Choice (s, t) == s + t * Kleene s == s* * * We optimize symbols slightly. * We allow a set of symbols in the regex: * Symbols { a1, ..., an } == (a1 + ... + an) *) type regex = Empty | Epsilon | Symbols of SymbolSet.t | Concat of regex * regex | Choice of regex * regex | Kleene of regex (************************************************************************ * TODO: produce a DFA from an NFA. *) let dfa_of_nfa nfa = raise (Failure "not implemented") (************************************************************************ * Regular expressions. * * In this section, we build NFAs from regular expressions. *) (* * The NFA for the empty language. *) let regex_empty = { nfa_states = 1; nfa_delta_normal = (fun _ _ -> StateSet.empty); nfa_delta_epsilon = (fun _ -> StateSet.empty); nfa_is_final = (fun _ -> false) } (* * The NFA that accepts the empty string. *) let regex_epsilon = { nfa_states = 1; nfa_delta_normal = (fun _ _ -> StateSet.empty); nfa_delta_epsilon = (fun _ -> StateSet.empty); nfa_is_final = (fun q -> q = 0) } (* * TODO: the NFA that accepts a single symbol. *) let regex_symbol_set c_set = raise (Failure "not implemented") (* * TODO: build the concatenation NFA. *) let regex_concat nfa1 nfa2 = raise (Failure "not implemented") (* * TODO: build the union NFA. *) let regex_choice nfa1 nfa2 = raise (Failure "not implemented") (* * TODO: build the Kleene closure NFA. *) let regex_kleene nfa = raise (Failure "not implemented") (* * Compile the regular expression to a NFA description. *) let rec nfa_of_regex regex = match regex with Empty -> regex_empty | Epsilon -> regex_epsilon | Symbols c_set -> regex_symbol_set c_set | Concat (r1, r2) -> regex_concat (nfa_of_regex r1) (nfa_of_regex r2) | Choice (r1, r2) -> regex_choice (nfa_of_regex r1) (nfa_of_regex r2) | Kleene r -> regex_kleene (nfa_of_regex r) (************************************************************************ * TEST PROGRAMS * * For the lab, you don't need to know the test program. * In fact, it would not normally be included in this file, * but we wanted to make compiling easier by putting everything * in one file. * * What we want to do is scan the input against multiple * regular expressions. The expressions are first compiled * to DFAs. Then we scan the input one character at a time * to see if some prefix matches against one of the regular * expressions. *) (************************************************************************ * Printing routines. You may want to use these for debugging. * You can print out a representation for a regular expression, * and for an NFA, but there is no printer currently for a DFA. *) let rec pp_print_regex buf regex = match regex with Empty -> pp_print_string buf "\\empty" | Epsilon -> pp_print_string buf "\\epsilon" | Symbols c_set -> fprintf buf "[@["; SymbolSet.iter (fun c -> fprintf buf "@ %s" (Char.escaped c)) c_set; fprintf buf "@]]" | Concat (r1, r2) -> fprintf buf "%a%a" pp_print_regex r1 pp_print_regex r2 | Choice (r1, r2) -> fprintf buf "(@[%a@ + %a@])" pp_print_regex r1 pp_print_regex r2 | Kleene r -> fprintf buf "(%a)*" pp_print_regex r let pp_print_state_set buf q_set = StateSet.iter (fun q -> fprintf buf "@ %d" q) q_set let pp_print_nfa buf nfa = let { nfa_states = states; nfa_delta_normal = delta_normal; nfa_delta_epsilon = delta_epsilon; nfa_is_final = is_final } = nfa in fprintf buf "@[@[NFA {@ states = %d;" states; for q = 0 to pred states do fprintf buf "@ @[@[state[%d] {" q; fprintf buf "@ @[@[normal {"; for c = 0 to 255 do let c = Char.chr c in let q_set = delta_normal q c in if not (StateSet.is_empty q_set) then fprintf buf "@ '%s' -> %a" (Char.escaped c) pp_print_state_set q_set done; fprintf buf "@]@ }@]@ @[epsilon = %a@];" pp_print_state_set (delta_epsilon q); if is_final q then fprintf buf "@ final"; fprintf buf "@]@ }@]" done; fprintf buf "@]@ }@]" (************************************************************************ * Stream inputs. We basically cheat herew, and treat the input * file a one big string. These routines read the entire file, * turn it into a string, then provide functional access routines * to access the string. *) type stream = { offset : int; buffer : string } let stream_of_file name = let s = try let inx = open_in name in let buf = Buffer.create 1024 in let rec loop () = let eof = try let c = input_char inx in Buffer.add_char buf c; false with End_of_file -> true in if not eof then loop () in loop (); close_in inx; Buffer.contents buf with Sys_error _ -> eprintf "Can't read file %s@." name; exit 2 in { offset = 0; buffer = s } (* * Test for end-of-file. *) let stream_eof { offset = off; buffer = buf } = off = String.length buf (* * Get the next character from the stream, *) let stream_get stream = let { offset = off; buffer = buf } = stream in let _ = if off = String.length buf then raise End_of_file in let c = buf.[off] in let stream = { stream with offset = succ off } in stream, c (* * Get the substring represented by two streams. * We assume they are valid and point to the same buffer. *) let stream_substring { offset = off1 } { offset = off2; buffer = buf } = String.sub buf off1 (off2 - off1) (************************************************************************ * Pattern matching. * * Simulate the DFA to determine if the DFA matches some * prefix of the stream. Raises Not_found if the DFA does * not match. *) let dfa_match dfa stream = let { dfa_delta = delta; dfa_start = start; dfa_is_final = is_final } = dfa in (* * Simulate the DFA. * q: the current state * result: the stream at the last final state * stream: the current input stream * * Returns a stream option that represents the * stream for the longest match. *) let rec simulate q result stream = (* * If this is a final state, then remember it as the result. *) let result = if is_final q then Some stream else result in (* * If the state is empty, then the machine * has rejected the input string. *) if StateSet.is_empty q then result else try let stream, c = stream_get stream in let q = dfa.dfa_delta q c in simulate q result stream with End_of_file -> result in simulate start None stream (* * Match the stream against a set of expressions. * Return the first match, or raise Not_found if * there is no match. *) let dfa_match_list dfas stream = let rec search dfas = match dfas with dfa :: dfas -> (match dfa_match dfa stream with Some stream' -> (* This machine matched, extract the substring *) let s = stream_substring stream stream' in stream', s | None -> (* Try the next one *) search dfas) | [] -> (* None of the patterns matched *) raise Not_found in search dfas (* * Parse all strings in the input against the DFA list. * For each input expression that matches, print the string * that was matched. *) let parse_file dfas file = let rec parse stream = if not (stream_eof stream) then try let stream, s = dfa_match_list dfas stream in printf "Input symbol: \"%s\"@." (String.escaped s); parse stream with Not_found -> let _, c = stream_get stream in eprintf "Syntax error: char %d = '%s'@." stream.offset (Char.escaped c); exit 1 | End_of_file -> () in parse (stream_of_file file) (************************************************************************ * Produce a regex from a string. You aren't supposed to realize * this yet, but we are effectively using a pushdown automaton to * turn the string into a regular expression. * * Firt, to implement the complement symbol set, we need the set * of all characters in the alphabet. *) let all_symbols = let rec loop s i = if i > 255 then s else loop (SymbolSet.add (Char.chr i) s) (succ i) in loop SymbolSet.empty 0 (* * Build a regex expression from a string. *) let regex_of_string s = let len = String.length s in (* Concatenate all the regular expressions in the list in reverse order *) let rec concat stack = match stack with [] -> Empty | [r] -> r | r1 :: stack -> Concat (concat stack, r1) in (* The scanner is a stack machine *) let rec scan stack i = if i >= len then stack, i else match s.[i] with '[' -> let stack, i = scan_range stack (succ i) in scan stack i | '+' -> let stack, i = scan stack (succ i) in let stack = match stack with r1 :: r2 :: stack -> Choice (r1, r2) :: stack | _ -> raise (Failure "+ operator in regex") in scan stack i | '*' -> let stack = match stack with r :: stack -> Kleene r :: stack | _ -> raise (Failure "* operator in regex") in scan stack (succ i) | '(' -> let stack', i = scan [] (succ i) in let stack = concat stack' :: stack in scan stack (succ i) | c -> let stack = Symbols (SymbolSet.singleton c) :: stack in scan stack (succ i) and scan_range stack i = if i = len then stack, i else if s.[i] = '^' then let set, i = scan_chars SymbolSet.empty (succ i) in let stack = Symbols (SymbolSet.diff all_symbols set) :: stack in stack, i else let set, i = scan_chars SymbolSet.empty i in let stack = Symbols set :: stack in stack, i and scan_chars set i = if i = len then set, i else let c = s.[i] in if c = ']' then set, succ i else scan_chars (SymbolSet.add c set) (succ i) in (* Now parse the input string *) let stack, _ = scan [] 0 in concat stack (* * Compile the string representing the regular expression * to a DFA. *) let dfa_of_string s = dfa_of_nfa (nfa_of_regex (regex_of_string s)) (* * Here are the patterns we want to match. *) let patterns = ["[ \t\n][ \t\n]*"; (* white space *) "//[^\n]*"; (* comments *) "[0123456789][0123456789]*"; (* numbers *) (* Words *) "[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_][abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_']*"; (* Other symbols *) "]+[.,;!=+-*/%?><]"] let compiled = List.map (fun s -> let regex = regex_of_string s in let nfa = nfa_of_regex regex in let dfa = dfa_of_nfa nfa in regex, nfa, dfa) patterns let dfas = List.map (fun (_, _, dfa) -> dfa) compiled (* * For debugging, print the regular expressions and the DFAs. *) let _ = if false then List.iter (fun (regex, nfa, dfa) -> printf "@[Regex: %a@ %a@]@." (**) pp_print_regex regex pp_print_nfa nfa) compiled (* * Get the argument string from the command line. *) let filename = let argv = Sys.argv in let len = Array.length argv in if len <> 2 then begin eprintf "Usage: %s @." argv.(0); exit 1 end; argv.(1) (* * Main function. *) let _ = parse_file dfas filename (*! * @docoff * * -*- * Local Variables: * Caml-master: "compile" * End: * -*- *)