(* * This file implements operations on Determistic Finite Automata (DFAs). * * ---------------------------------------------------------------- * * @begin[license] * Copyright (C) 2002 Jason Hickey, Caltech * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Author: Jason Hickey * @email{jyh@cs.caltech.edu} * @end[license] *) open Format (************************************************************************ * Modules and type definitions. *) (* * The data structure that represents a set of integers. * We'll use small integers only, so the (-) function is * a valid comparison. *) module StateCompare = struct type t = int let compare = (-) end module StateSet = Set.Make (StateCompare) type state = int type state_set = StateSet.t (* * We use the ASCII characters as the symbols * in this lab. *) type symbol = char module SymbolCompare = struct type t = symbol let compare = Pervasives.compare end module SymbolSet = Set.Make (SymbolCompare) (* * For this lab, we'll use a simplified representation of * automata. The states of a DFA are finite sets of integers, * and the alphabet is always char. The reason why we do this * is mainly to avoid the module system. We want to be able * to build DFAs at run-time. * * In this simplified representation, a DFA is a 3-tuple: * dfa_delta : the transition function. * dfa_start : the start state * dfa_is_final : a final state predicate. *) type dfa = { dfa_delta : state_set -> symbol -> state_set; dfa_start : state_set; dfa_is_final : state_set -> bool } (* * We also use the simplified representation for NFAs. * * Formally, an NFA with epsilon-transitions is a 6-tuple: * state : the type of states. * symbol : the type of symbols. * delta_normal : the transition relation. To simplify the encoding, * the relation is expressed as a function that produces * a state set. * delta_epsilon : the epsilon-transitions, again expressed * as a state set. * start : the start state. * is_final : a final state predicate. * * In the simplified representation, we make the following assumptions: * 1. The states are integers in the range [0, nfa_states) * 2. The symbols are the "symbol" type (char) * 3. The start state is always state 0 *) type nfa = { nfa_states : state; nfa_delta_normal : state -> symbol -> state_set; nfa_delta_epsilon : state -> state_set; nfa_is_final : state -> bool } (* * The type of regular expressions. * * Empty == {} * Epsilon == { epsilon } * Concat (s, t) == st * Choice (s, t) == s + t * Kleene s == s* * * We optimize symbols slightly. * We allow a set of symbols in the regex: * Symbols { a1, ..., an } == (a1 + ... + an) *) type regex = Empty | Epsilon | Symbols of SymbolSet.t | Concat of regex * regex | Choice of regex * regex | Kleene of regex (************************************************************************ * Produce a DFA from an NFA. *) let dfa_of_nfa nfa = let { nfa_states = states; nfa_delta_normal = delta_normal; nfa_delta_epsilon = delta_epsilon; nfa_is_final = is_final } = nfa in (* * Build the epsilon-closure of a given state. * This is the fixpoint of all epsilon-transitions. *) let rec closure q_set = let q_set' = StateSet.fold (fun q q_result -> StateSet.union q_result (delta_epsilon q)) q_set q_set in if StateSet.equal q_set' q_set then q_set else closure q_set' in (* * Transition function. *) let delta q_set c = let q_new = StateSet.fold (fun q q_new -> StateSet.union q_new (delta_normal q c)) q_set StateSet.empty in closure q_new in (* * Initial state is the set containing the initial NFA state. *) let start = closure (StateSet.singleton 0) in (* * A state is final if any of the state elements * are final in the NFA. *) let is_final q_set = StateSet.exists is_final q_set in (* The final DFA *) { dfa_delta = delta; dfa_start = start; dfa_is_final = is_final } (************************************************************************ * Regular expressions. * * In this section, we build NFAs from regular expressions. *) (* * The NFA for the empty language. *) let regex_empty = { nfa_states = 1; nfa_delta_normal = (fun _ _ -> StateSet.empty); nfa_delta_epsilon = (fun _ -> StateSet.empty); nfa_is_final = (fun _ -> false) } (* * The NFA that accepts the empty string. *) let regex_epsilon = { nfa_states = 1; nfa_delta_normal = (fun _ _ -> StateSet.empty); nfa_delta_epsilon = (fun _ -> StateSet.empty); nfa_is_final = (fun q -> q = 0) } (* * The NFA that accepts a single symbol. *) let regex_symbol_set c_set = let state_one = StateSet.singleton 1 in let delta_normal q c = if q = 0 && SymbolSet.mem c c_set then state_one else StateSet.empty in { nfa_states = 2; nfa_delta_normal = delta_normal; nfa_delta_epsilon = (fun _ -> StateSet.empty); nfa_is_final = (fun q -> q = 1) } (* * The concatenation NFA. * * The construction is as follows: * 1. The number of states is (nfa1.nfa_states + nfa2.nfa_states) * All the states with numbers less than nfa1.states1 correspond * to machine1. For any other state q, the state corresponds to * (q - states1) in machine2. * * 2. Add an epsilon transition from each final state in machine1 * to the initial state in machine2 (this is the state with * numbered states1). * * 3. A state is final iff it is a final state in machine2. *) let regex_concat nfa1 nfa2 = let { nfa_states = states1; nfa_delta_normal = delta_normal1; nfa_delta_epsilon = delta_epsilon1; nfa_is_final = is_final1 } = nfa1 in let { nfa_states = states2; nfa_delta_normal = delta_normal2; nfa_delta_epsilon = delta_epsilon2; nfa_is_final = is_final2 } = nfa2 in (* * State manipulation functions. * * You don't have to be as general as this in your solution. * We do this mainly for explanitory purposes. *) let is_state1 q = q < states1 in let is_state2 q = q >= states1 in let state_of_state1 q = q in let state_of_state2 q = q + states1 in let state1_of_state q = q in let state2_of_state q = q - states1 in (* * Translate a set of states to the representation * in the current, constructed machine. *) let translate_states1 q_set = q_set in let translate_states2 q_set = StateSet.fold (fun q q_set -> StateSet.add (state_of_state2 q) q_set) q_set StateSet.empty in (* * Transition function. If the state belongs to machine1, * then use the transition relation for machine1. Otherwise * use the transition relation for machine2, offset by * states1. *) let delta_normal q c = if is_state1 q then translate_states1 (delta_normal1 (state1_of_state q) c) else translate_states2 (delta_normal2 (state2_of_state q) c) in (* * Epsilon transitions. * This is computed mainly the same way as for the normal * transition function, but we add an epsilon transition from * any final state in machine1 to the initial state in machine2 * (the initial state is numbered states1). *) let delta_epsilon q = if is_state1 q then let q = state1_of_state q in let q_set = translate_states1 (delta_epsilon1 q) in if is_final1 q then StateSet.add (state_of_state2 0) q_set else q_set else (* is_state2 q *) translate_states2 (delta_epsilon2 (state2_of_state q)) in (* * A state is final iff it is a final state in machine2. *) let is_final q = is_state2 q && is_final2 (state2_of_state q) in (* The final automaton *) { nfa_states = states1 + states2; nfa_delta_normal = delta_normal; nfa_delta_epsilon = delta_epsilon; nfa_is_final = is_final } (* * The union NFA. * * The construction is as follows: * 1. The number of states is (nfa1.nfa_states + nfa2.nfa_states + 1) * The state classes are: * * 0: the initial state, which has epsilon transitions to * start states for the two machines: 1 and (states1 + 1). * [1, states1]: these states correspond to the states in machine1. * [states1 + 1, states1 + states2]: these state correspong to the * states of machine 2. * * 2. Add an epsilon transition from the initial state to the start * states 1 and (states1 + 1). * * 3. A state is final iff it is a final state in either machine. *) let regex_choice nfa1 nfa2 = let { nfa_states = states1; nfa_delta_normal = delta_normal1; nfa_delta_epsilon = delta_epsilon1; nfa_is_final = is_final1 } = nfa1 in let { nfa_states = states2; nfa_delta_normal = delta_normal2; nfa_delta_epsilon = delta_epsilon2; nfa_is_final = is_final2 } = nfa2 in (* * State manipulation functions. * * You don't have to be as general as this in your solution. * We do this mainly for explanitory purposes. *) let is_state1 q = q > 0 && q <= states1 in let is_state2 q = q > states1 in let state_of_state1 q = q + 1 in let state_of_state2 q = q + states1 + 1 in let state1_of_state q = q - 1 in let state2_of_state q = q - states1 - 1 in (* * Translate a set of states to the representation * in the current, constructed machine. *) let translate_states1 q_set = StateSet.fold (fun q q_set -> StateSet.add (state_of_state1 q) q_set) q_set StateSet.empty in let translate_states2 q_set = StateSet.fold (fun q q_set -> StateSet.add (state_of_state2 q) q_set) q_set StateSet.empty in (* * Transition function. If the state belongs to machine1, * then use the transition relation for machine1. Otherwise * use the transition relation for machine2, offset by * states1. *) let delta_normal q c = if q = 0 then StateSet.empty else if is_state1 q then translate_states1 (delta_normal1 (state1_of_state q) c) else (* is_state2 q *) translate_states2 (delta_normal2 (state2_of_state q) c) in (* * Epsilon transitions. * This is computed mainly the same way as for the normal * transition function, but for the start state we add eps-transitions to * the start states of the two sub-machines. *) let delta_epsilon q = if q = 0 then StateSet.add (state_of_state1 0) (StateSet.add (state_of_state2 0) StateSet.empty) else if is_state1 q then translate_states1 (delta_epsilon1 (state1_of_state q)) else (* is_state2 q *) translate_states2 (delta_epsilon2 (state2_of_state q)) in (* * A state is final iff it is a final state in machine2. *) let is_final q = is_state1 q && is_final1 (state1_of_state q) || is_state2 q && is_final2 (state2_of_state q) in (* The final automaton *) { nfa_states = states1 + states2 + 1; nfa_delta_normal = delta_normal; nfa_delta_epsilon = delta_epsilon; nfa_is_final = is_final } (* * The Kleene closure NFA. * * The automaton is the same as the original automaton, except: * a. the start state is final * b. there is an epsilon transition from each final state * back to the initial state. *) let regex_kleene nfa = let { nfa_states = states; nfa_delta_normal = delta_normal; nfa_delta_epsilon = delta_epsilon; nfa_is_final = is_final } = nfa in (* * Epsilon transitions. * This is computed mainly the same way as for the normal * transition function, but we add an epsilon transition from * any final state back to the initial state. *) let delta_epsilon' q = let q_set = delta_epsilon q in if is_final q then StateSet.add 0 q_set else q_set in (* * A state is final iff it is a final state in machine2. *) let is_final' q = q = 0 || is_final q in (* The final automaton *) { nfa_states = states; nfa_delta_normal = delta_normal; (* Unchanged *) nfa_delta_epsilon = delta_epsilon'; nfa_is_final = is_final' } (* * Compile the regular expression to a NFA description. *) let rec nfa_of_regex regex = match regex with Empty -> regex_empty | Epsilon -> regex_epsilon | Symbols c_set -> regex_symbol_set c_set | Concat (r1, r2) -> regex_concat (nfa_of_regex r1) (nfa_of_regex r2) | Choice (r1, r2) -> regex_choice (nfa_of_regex r1) (nfa_of_regex r2) | Kleene r -> regex_kleene (nfa_of_regex r) (* * Compile the regex to a DFA. *) let dfa_of_regex regex = dfa_of_nfa (nfa_of_regex regex) (************************************************************************ * Test programs. * * For the lab, you don't need to know the test program. * In fact, it would not normally be included in this file, * but we wanted to make compiling easier by putting everything * in one file. * * What we want to do is scan the input against multiple * regular expressions. The expressions are first compiled * to DFAs. Then we scan the input one character at a time * to see if some prefix matches against one of the regular * expressions. *) (************************************************************************ * Printing routines. You may want to use these for debugging. * You can print out a representation for a regular expression, * and for an NFA, but there is no printer currently for a DFA. *) let rec pp_print_regex buf regex = match regex with Empty -> pp_print_string buf "\\empty" | Epsilon -> pp_print_string buf "\\epsilon" | Symbols c_set -> fprintf buf "[@["; SymbolSet.iter (fun c -> fprintf buf "@ %s" (Char.escaped c)) c_set; fprintf buf "@]]" | Concat (r1, r2) -> fprintf buf "%a%a" pp_print_regex r1 pp_print_regex r2 | Choice (r1, r2) -> fprintf buf "(@[%a@ + %a@])" pp_print_regex r1 pp_print_regex r2 | Kleene r -> fprintf buf "(%a)*" pp_print_regex r let pp_print_state_set buf q_set = StateSet.iter (fun q -> fprintf buf "@ %d" q) q_set let pp_print_nfa buf nfa = let { nfa_states = states; nfa_delta_normal = delta_normal; nfa_delta_epsilon = delta_epsilon; nfa_is_final = is_final } = nfa in fprintf buf "@[@[NFA {@ states = %d;" states; for q = 0 to pred states do fprintf buf "@ @[@[state[%d] {" q; fprintf buf "@ @[@[normal {"; for c = 0 to 255 do let c = Char.chr c in let q_set = delta_normal q c in if not (StateSet.is_empty q_set) then fprintf buf "@ '%s' -> %a" (Char.escaped c) pp_print_state_set q_set done; fprintf buf "@]@ }@]@ @[epsilon = %a@];" pp_print_state_set (delta_epsilon q); if is_final q then fprintf buf "@ final"; fprintf buf "@]@ }@]" done; fprintf buf "@]@ }@]" (************************************************************************ * Stream inputs. We basically cheat herew, and treat the input * file a one big string. These routines read the entire file, * turn it into a string, then provide functional access routines * to access the string. *) type stream = { offset : int; buffer : string } let stream_of_file name = let s = try let inx = open_in name in let buf = Buffer.create 1024 in let rec loop () = let eof = try let c = input_char inx in Buffer.add_char buf c; false with End_of_file -> true in if not eof then loop () in loop (); close_in inx; Buffer.contents buf with Sys_error _ -> eprintf "Can't read file %s@." name; exit 2 in { offset = 0; buffer = s } (* * Test for end-of-file. *) let stream_eof { offset = off; buffer = buf } = off = String.length buf (* * Get the next character from the stream, *) let stream_get stream = let { offset = off; buffer = buf } = stream in let _ = if off = String.length buf then raise End_of_file in let c = buf.[off] in let stream = { stream with offset = succ off } in stream, c (* * Get the substring represented by two streams. * We assume they are valid and point to the same buffer. *) let stream_substring { offset = off1 } { offset = off2; buffer = buf } = String.sub buf off1 (off2 - off1) (************************************************************************ * Pattern matching. * * Simulate the DFA to determine if the DFA matches some * prefix of the stream. Raises Not_found if the DFA does * not match. *) let dfa_match dfa stream = let { dfa_delta = delta; dfa_start = start; dfa_is_final = is_final } = dfa in (* * Simulate the DFA. * q: the current state * result: the stream at the last final state * stream: the current input stream * * Returns a stream option that represents the * stream for the longest match. *) let rec simulate q result stream = (* * If this is a final state, then remember it as the result. *) let result = if is_final q then Some stream else result in (* * If the state is empty, then the machine * has rejected the input string. *) if StateSet.is_empty q then result else try let stream, c = stream_get stream in let q = dfa.dfa_delta q c in simulate q result stream with End_of_file -> result in simulate start None stream (* * Match the stream against a set of expressions. * Return the first match, or raise Not_found if * there is no match. *) let dfa_match_list dfas stream = let rec search dfas = match dfas with dfa :: dfas -> (match dfa_match dfa stream with Some stream' -> (* This machine matched, extract the substring *) let s = stream_substring stream stream' in stream', s | None -> (* Try the next one *) search dfas) | [] -> (* None of the patterns matched *) raise Not_found in search dfas (* * Parse all strings in the input against the DFA list. * For each input expression that matches, print the string * that was matched. *) let parse_file dfas file = let rec parse stream = if not (stream_eof stream) then try let stream, s = dfa_match_list dfas stream in printf "Input symbol: \"%s\"@." (String.escaped s); parse stream with Not_found -> let _, c = stream_get stream in eprintf "Syntax error: char %d = '%s'@." stream.offset (Char.escaped c); exit 1 | End_of_file -> () in parse (stream_of_file file) (************************************************************************ * Produce a regex from a string. You aren't supposed to realize * this yet, but we are effectively using a pushdown automaton to * turn the string into a regular expression. * * Firt, to implement the complement symbol set, we need the set * of all characters in the alphabet. *) let all_symbols = let rec loop s i = if i > 255 then s else loop (SymbolSet.add (Char.chr i) s) (succ i) in loop SymbolSet.empty 0 (* * Build a regex expression from a string. *) let regex_of_string s = let len = String.length s in (* Concatenate all the regular expressions in the list in reverse order *) let rec concat stack = match stack with [] -> Empty | [r] -> r | r1 :: stack -> Concat (concat stack, r1) in (* The scanner is a stack machine *) let rec scan stack i = if i >= len then stack, i else match s.[i] with '[' -> let stack, i = scan_range stack (succ i) in scan stack i | '+' -> let stack, i = scan stack (succ i) in let stack = match stack with r1 :: r2 :: stack -> Choice (r1, r2) :: stack | _ -> raise (Failure "+ operator in regex") in scan stack i | '*' -> let stack = match stack with r :: stack -> Kleene r :: stack | _ -> raise (Failure "* operator in regex") in scan stack (succ i) | '(' -> let stack', i = scan [] (succ i) in let stack = concat stack' :: stack in scan stack (succ i) | c -> let stack = Symbols (SymbolSet.singleton c) :: stack in scan stack (succ i) and scan_range stack i = if i = len then stack, i else if s.[i] = '^' then let set, i = scan_chars SymbolSet.empty (succ i) in let stack = Symbols (SymbolSet.diff all_symbols set) :: stack in stack, i else let set, i = scan_chars SymbolSet.empty i in let stack = Symbols set :: stack in stack, i and scan_chars set i = if i = len then set, i else let c = s.[i] in if c = ']' then set, succ i else scan_chars (SymbolSet.add c set) (succ i) in (* Now parse the input string *) let stack, _ = scan [] 0 in concat stack (* * Compile the string representing the regular expression * to a DFA. *) let dfa_of_string s = dfa_of_nfa (nfa_of_regex (regex_of_string s)) (* * Here are the patterns we want to match. *) let patterns = ["[ \t\n][ \t\n]*"; (* white space *) "//[^\n]*"; (* comments *) "[0123456789][0123456789]*"; (* numbers *) (* Words *) "[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_][abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_']*"; (* Other symbols *) "]+[.,;!=+-*/%?><]"] let compiled = List.map (fun s -> let regex = regex_of_string s in let nfa = nfa_of_regex regex in let dfa = dfa_of_nfa nfa in regex, nfa, dfa) patterns let dfas = List.map (fun (_, _, dfa) -> dfa) compiled (* * For debugging, print the regular expressions and the DFAs. *) let _ = if false then List.iter (fun (regex, nfa, dfa) -> printf "@[Regex: %a@ %a@]@." (**) pp_print_regex regex pp_print_nfa nfa) compiled (* * Get the argument string from the command line. *) let filename = let argv = Sys.argv in let len = Array.length argv in if len <> 2 then begin eprintf "Usage: %s @." argv.(0); exit 1 end; argv.(1) (* * Main function. *) let _ = parse_file dfas filename (*! * @docoff * * -*- * Local Variables: * Caml-master: "compile" * End: * -*- *)