# HG changeset patch # User Adam Chlipala # Date 1223412646 14400 # Node ID a08e82c646a522d35a6647546ba3562576668df7 # Parent 41392e5acbf563a764db9309338bdf78b8b2582c Comment regexp matcher diff -r 41392e5acbf5 -r a08e82c646a5 src/MoreDep.v --- a/src/MoreDep.v Tue Oct 07 16:08:58 2008 -0400 +++ b/src/MoreDep.v Tue Oct 07 16:50:46 2008 -0400 @@ -346,6 +346,10 @@ (** * A Certified Regular Expression Matcher *) +(** Another interesting example is regular expressions with dependent types that express which predicates over strings particular regexps implement. We can then assign a dependent type to a regular expression matching function, guaranteeing that it always decides the string property that we expect it to decide. + + Before defining the syntax of expressions, it is helpful to define an inductive type capturing the meaning of the Kleene star. We use Coq's string support, which comes through a combination of the [Strings] library and some parsing notations built into Coq. Operators like [++] and functions like [length] that we know from lists are defined again for strings. Notation scopes help us control which versions we want to use in particular contexts. *) + Require Import Ascii String. Open Scope string_scope. @@ -360,6 +364,23 @@ -> star (s1 ++ s2). End star. +(** Now we can make our first attempt at defining a [regexp] type that is indexed by predicates on strings. Here is a reasonable-looking definition that is restricted to constant characters and concatenation. + +[[ +Inductive regexp : (string -> Prop) -> Set := +| Char : forall ch : ascii, + regexp (fun s => s = String ch "") +| Concat : forall (P1 P2 : string -> Prop) (r1 : regexp P1) (r2 : regexp P2), + regexp (fun s => exists s1, exists s2, s = s1 ++ s2 /\ P1 s1 /\ P2 s2). + +[[ +User error: Large non-propositional inductive types must be in Type +]] + +What is a large inductive type? In Coq, it is an inductive type that has a constructor which quantifies over some type of type [Type]. We have not worked with [Type] very much to this point. Every term of CIC has a type, including [Set] and [Prop], which are assigned type [Type]. The type [string -> Prop] from the failed definition also has type [Type]. + +It turns out that allowing large inductive types in [Set] leads to contradictions when combined with certain kinds of classical logic reasoning. Thus, by default, such types are ruled out. There is a simple fix for our [regexp] definition, which is to place our new type in [Type]. While fixing the problem, we also expand the list of constructors to cover the remaining regular expression operators. *) + Inductive regexp : (string -> Prop) -> Type := | Char : forall ch : ascii, regexp (fun s => s = String ch "") @@ -370,6 +391,9 @@ | Star : forall P (r : regexp P), regexp (star P). +(** Many theorems about strings are useful for implementing a certified regexp matcher, and few of them are in the [Strings] library. The book source includes statements, proofs, and hint commands for a handful of such omittted theorems. Since they are orthogonal to our use of dependent types, we hide them in the rendered versions of this book. *) + +(* begin hide *) Open Scope specif_scope. Lemma length_emp : length "" <= 0. @@ -397,7 +421,7 @@ Qed. Lemma substring_none : forall s n, - substring n 0 s = EmptyString. + substring n 0 s = "". induction s; substring. Qed. @@ -430,13 +454,20 @@ Qed. Hint Rewrite substring_app_fst substring_app_snd using (trivial; fail) : cpdt. +(* end hide *) + +(** A few auxiliary functions help us in our final matcher definition. The function [split] will be used to implement the regexp concatenation case. *) Section split. Variables P1 P2 : string -> Prop. Variable P1_dec : forall s, {P1 s} + { ~P1 s}. Variable P2_dec : forall s, {P2 s} + { ~P2 s}. + (** We require a choice of two arbitrary string predicates and functions for deciding them. *) Variable s : string. + (** Our computation will take place relative to a single fixed string, so it is easiest to make it a [Variable], rather than an explicit argument to our functions. *) + + (** [split'] is the workhorse behind [split]. It searches through the possible ways of splitting [s] into two pieces, checking the two predicates against each such pair. [split'] progresses right-to-left, from splitting all of [s] into the first piece to splitting all of [s] into the second piece. It takes an extra argument, [n], which specifies how far along we are in this search process. *) Definition split' (n : nat) : n <= length s -> {exists s1, exists s2, length s1 <= n /\ s1 ++ s2 = s /\ P1 s1 /\ P2 s2} @@ -448,7 +479,8 @@ -> {exists s1, exists s2, length s1 <= n /\ s1 ++ s2 = s /\ P1 s1 /\ P2 s2} + {forall s1 s2, length s1 <= n -> s1 ++ s2 = s -> ~P1 s1 \/ ~P2 s2} with | O => fun _ => Reduce (P1_dec "" && P2_dec s) - | S n' => fun _ => (P1_dec (substring 0 (S n') s) && P2_dec (substring (S n') (length s - S n') s)) + | S n' => fun _ => (P1_dec (substring 0 (S n') s) + && P2_dec (substring (S n') (length s - S n') s)) || F n' _ end); clear F; crush; eauto 7; match goal with @@ -458,6 +490,18 @@ end; crush. Defined. + (** There is one subtle point in the [split'] code that is worth mentioning. The main body of the function is a [match] on [n]. In the case where [n] is known to be [S n'], we write [S n'] in several places where we might be tempted to write [n]. However, without further work to craft proper [match] annotations, the type-checker does not use the equality between [n] and [S n']. Thus, it is common to see patterns repeated in [match] case bodies in dependently-typed Coq code. We can at least use a [let] expression to avoid copying the pattern more than once, replacing the first case body with: + + [[ + + | S n' => fun _ => let n := S n' in + (P1_dec (substring 0 n s) + && P2_dec (substring n (length s - n) s)) + || F n' _ + ]] + + [split] itself is trivial to implement in terms of [split']. We just ask [split'] to begin its search with [n = length s]. *) + Definition split : {exists s1, exists s2, s = s1 ++ s2 /\ P1 s1 /\ P2 s2} + {forall s1 s2, s = s1 ++ s2 -> ~P1 s1 \/ ~P2 s2}. refine (Reduce (split' (n := length s) _)); crush; eauto. @@ -466,6 +510,7 @@ Implicit Arguments split [P1 P2]. +(* begin hide *) Lemma app_empty_end : forall s, s ++ "" = s. induction s; crush. Qed. @@ -557,11 +602,17 @@ Qed. Hint Rewrite minus_minus using omega : cpdt. +(* end hide *) + +(** One more helper function will come in handy: [dec_star], for implementing another linear search through ways of splitting a string, this time for implementing the Kleene star. *) Section dec_star. Variable P : string -> Prop. Variable P_dec : forall s, {P s} + { ~P s}. + (** Some new lemmas and hints about the [star] type family are useful here. We omit them here; they are included in the book source at this point. *) + + (* begin hide *) Hint Constructors star. Lemma star_empty : forall s, @@ -623,28 +674,38 @@ | [ H : star _ _ |- _ ] => generalize (star_inv H); do 3 crush; eauto end. Qed. + (* end hide *) + + (** The function [dec_star''] implements a single iteration of the star. That is, it tries to find a string prefix matching [P], and it calls a parameter function on the remainder of the string. *) Section dec_star''. Variable n : nat. + (** [n] is the length of the prefix of [s] that we have already processed. *) Variable P' : string -> Prop. Variable P'_dec : forall n' : nat, n' > n -> {P' (substring n' (length s - n') s)} + { ~P' (substring n' (length s - n') s)}. + (** When we use [dec_star''], we will instantiate [P'_dec] with a function for continuing the search for more instances of [P] in [s]. *) + + (** Now we come to [dec_star''] itself. It takes as an input a natural [l] that records how much of the string has been searched so far, as we did for [split']. The return type expresses that [dec_star''] is looking for an index into [s] that splits [s] into a nonempty prefix and a suffix, such that the prefix satisfies [P] and the suffix satisfies [P']. *) Definition dec_star'' (l : nat) : {exists l', S l' <= l /\ P (substring n (S l') s) /\ P' (substring (n + S l') (length s - (n + S l')) s)} + {forall l', S l' <= l - -> ~P (substring n (S l') s) \/ ~P' (substring (n + S l') (length s - (n + S l')) s)}. + -> ~P (substring n (S l') s) + \/ ~P' (substring (n + S l') (length s - (n + S l')) s)}. refine (fix F (l : nat) : {exists l', S l' <= l /\ P (substring n (S l') s) /\ P' (substring (n + S l') (length s - (n + S l')) s)} + {forall l', S l' <= l - -> ~P (substring n (S l') s) \/ ~P' (substring (n + S l') (length s - (n + S l')) s)} := + -> ~P (substring n (S l') s) + \/ ~P' (substring (n + S l') (length s - (n + S l')) s)} := match l return {exists l', S l' <= l /\ P (substring n (S l') s) /\ P' (substring (n + S l') (length s - (n + S l')) s)} + {forall l', S l' <= l -> - ~P (substring n (S l') s) \/ ~P' (substring (n + S l') (length s - (n + S l')) s)} with + ~P (substring n (S l') s) + \/ ~P' (substring (n + S l') (length s - (n + S l')) s)} with | O => _ | S l' => (P_dec (substring n (S l') s) && P'_dec (n' := n + S l') _) @@ -656,6 +717,7 @@ Defined. End dec_star''. + (* begin hide *) Lemma star_length_contra : forall n, length s > n -> n >= length s @@ -671,16 +733,19 @@ Qed. Hint Resolve star_length_contra star_length_flip substring_suffix_emp. + (* end hide *) + (** The work of [dec_star''] is nested inside another linear search by [dec_star'], which provides the final functionality we need, but for arbitrary suffixes of [s], rather than just for [s] overall. *) + Definition dec_star' (n n' : nat) : length s - n' <= n -> {star P (substring n' (length s - n') s)} - + {~star P (substring n' (length s - n') s)}. + + { ~star P (substring n' (length s - n') s)}. refine (fix F (n n' : nat) {struct n} : length s - n' <= n -> {star P (substring n' (length s - n') s)} - + {~star P (substring n' (length s - n') s)} := + + { ~star P (substring n' (length s - n') s)} := match n return length s - n' <= n -> {star P (substring n' (length s - n') s)} - + {~star P (substring n' (length s - n') s)} with + + { ~star P (substring n' (length s - n') s)} with | O => fun _ => Yes | S n'' => fun _ => le_gt_dec (length s) n' @@ -695,6 +760,8 @@ end. Defined. + (** Finally, we have [dec_star]. It has a straightforward implementation. We introduce a spurious match on [s] so that [simpl] will know to reduce calls to [dec_star]. The heuristic that [simpl] uses is only to unfold identifier definitions when doing so would simplify some [match] expression. *) + Definition dec_star : {star P s} + { ~star P s}. refine (match s with | "" => Reduce (dec_star' (n := length s) 0 _) @@ -703,6 +770,7 @@ Defined. End dec_star. +(* begin hide *) Lemma app_cong : forall x1 y1 x2 y2, x1 = x2 -> y1 = y2 @@ -711,6 +779,9 @@ Qed. Hint Resolve app_cong. +(* end hide *) + +(** With these helper functions completed, the implementation of our [matches] function is refreshingly straightforward. We only need one small piece of specific tactic work beyond what [crush] does for us. *) Definition matches P (r : regexp P) s : {P s} + { ~P s}. refine (fix F P (r : regexp P) s : {P s} + { ~P s} := @@ -722,10 +793,10 @@ end); crush; match goal with | [ H : _ |- _ ] => generalize (H _ _ (refl_equal _)) - end; - tauto. + end; tauto. Defined. +(* begin hide *) Example hi := Concat (Char "h"%char) (Char "i"%char). Eval simpl in matches hi "hi". Eval simpl in matches hi "bye". @@ -741,3 +812,4 @@ Eval simpl in matches a_star "a". Eval simpl in matches a_star "b". Eval simpl in matches a_star "aa". +(* end hide *)