1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Enable this to squash warnings due to exporting pieces of the representation
// for use with the regex! macro. See lib.rs for explanation.

use self::Inst::*;

use std::cmp;
use std::iter::repeat;
use parse;
use parse::{Flags, FLAG_EMPTY};
use parse::Ast::{Nothing, Literal, Dot, AstClass, Begin, End, WordBoundary, Capture, Cat, Alt,
                 Rep};
use parse::Repeater::{ZeroOne, ZeroMore, OneMore};

pub type InstIdx = usize;

/// An instruction, the underlying unit of a compiled regular expression
#[derive(Show, Clone)]
pub enum Inst {
    /// When a Match instruction is executed, the current thread is successful.
    Match,

    /// The OneChar instruction matches a literal character.
    /// The flags indicate whether to do a case insensitive match.
    OneChar(char, Flags),

    /// The CharClass instruction tries to match one input character against
    /// the range of characters given.
    /// The flags indicate whether to do a case insensitive match and whether
    /// the character class is negated or not.
    CharClass(Vec<(char, char)>, Flags),

    /// Matches any character except new lines.
    /// The flags indicate whether to include the '\n' character.
    Any(Flags),

    /// Matches the beginning of the string, consumes no characters.
    /// The flags indicate whether it matches if the preceding character
    /// is a new line.
    EmptyBegin(Flags),

    /// Matches the end of the string, consumes no characters.
    /// The flags indicate whether it matches if the proceeding character
    /// is a new line.
    EmptyEnd(Flags),

    /// Matches a word boundary (\w on one side and \W \A or \z on the other),
    /// and consumes no character.
    /// The flags indicate whether this matches a word boundary or something
    /// that isn't a word boundary.
    EmptyWordBoundary(Flags),

    /// Saves the current position in the input string to the Nth save slot.
    Save(usize),

    /// Jumps to the instruction at the index given.
    Jump(InstIdx),

    /// Jumps to the instruction at the first index given. If that leads to
    /// a panic state, then the instruction at the second index given is
    /// tried.
    Split(InstIdx, InstIdx),
}

/// Program represents a compiled regular expression. Once an expression is
/// compiled, its representation is immutable and will never change.
///
/// All of the data in a compiled expression is wrapped in "MaybeStatic" or
/// "MaybeOwned" types so that a `Program` can be represented as static data.
/// (This makes it convenient and efficient for use with the `regex!` macro.)
#[derive(Clone)]
pub struct Program {
    /// A sequence of instructions.
    pub insts: Vec<Inst>,
    /// If the regular expression requires a literal prefix in order to have a
    /// match, that prefix is stored here. (It's used in the VM to implement
    /// an optimization.)
    pub prefix: String,
}

impl Program {
    /// Compiles a Regex given its AST.
    pub fn new(ast: parse::Ast) -> (Program, Vec<Option<String>>) {
        let mut c = Compiler {
            insts: Vec::with_capacity(100),
            names: Vec::with_capacity(10),
        };

        c.insts.push(Save(0));
        c.compile(ast);
        c.insts.push(Save(1));
        c.insts.push(Match);

        // Try to discover a literal string prefix.
        // This is a bit hacky since we have to skip over the initial
        // 'Save' instruction.
        let mut pre = String::with_capacity(5);
        for inst in c.insts[1..].iter() {
            match *inst {
                OneChar(c, FLAG_EMPTY) => pre.push(c),
                _ => break
            }
        }

        let Compiler { insts, names } = c;
        let prog = Program {
            insts: insts,
            prefix: pre,
        };
        (prog, names)
    }

    /// Returns the total number of capture groups in the regular expression.
    /// This includes the zeroth capture.
    pub fn num_captures(&self) -> usize {
        let mut n = 0;
        for inst in self.insts.iter() {
            match *inst {
                Save(c) => n = cmp::max(n, c+1),
                _ => {}
            }
        }
        // There's exactly 2 Save slots for every capture.
        n / 2
    }
}

struct Compiler<'r> {
    insts: Vec<Inst>,
    names: Vec<Option<String>>,
}

// The compiler implemented here is extremely simple. Most of the complexity
// in this crate is in the parser or the VM.
// The only tricky thing here is patching jump/split instructions to point to
// the right instruction.
impl<'r> Compiler<'r> {
    fn compile(&mut self, ast: parse::Ast) {
        match ast {
            Nothing => {},
            Literal(c, flags) => self.push(OneChar(c, flags)),
            Dot(nl) => self.push(Any(nl)),
            AstClass(ranges, flags) =>
                self.push(CharClass(ranges, flags)),
            Begin(flags) => self.push(EmptyBegin(flags)),
            End(flags) => self.push(EmptyEnd(flags)),
            WordBoundary(flags) => self.push(EmptyWordBoundary(flags)),
            Capture(cap, name, x) => {
                let len = self.names.len();
                if cap >= len {
                    self.names.extend(repeat(None).take(10 + cap - len))
                }
                self.names[cap] = name;

                self.push(Save(2 * cap));
                self.compile(*x);
                self.push(Save(2 * cap + 1));
            }
            Cat(xs) => {
                for x in xs.into_iter() {
                    self.compile(x)
                }
            }
            Alt(x, y) => {
                let split = self.empty_split(); // push: split 0, 0
                let j1 = self.insts.len();
                self.compile(*x);                // push: insts for x
                let jmp = self.empty_jump();    // push: jmp 0
                let j2 = self.insts.len();
                self.compile(*y);                // push: insts for y
                let j3 = self.insts.len();

                self.set_split(split, j1, j2);  // split 0, 0 -> split j1, j2
                self.set_jump(jmp, j3);         // jmp 0      -> jmp j3
            }
            Rep(x, ZeroOne, g) => {
                let split = self.empty_split();
                let j1 = self.insts.len();
                self.compile(*x);
                let j2 = self.insts.len();

                if g.is_greedy() {
                    self.set_split(split, j1, j2);
                } else {
                    self.set_split(split, j2, j1);
                }
            }
            Rep(x, ZeroMore, g) => {
                let j1 = self.insts.len();
                let split = self.empty_split();
                let j2 = self.insts.len();
                self.compile(*x);
                let jmp = self.empty_jump();
                let j3 = self.insts.len();

                self.set_jump(jmp, j1);
                if g.is_greedy() {
                    self.set_split(split, j2, j3);
                } else {
                    self.set_split(split, j3, j2);
                }
            }
            Rep(x, OneMore, g) => {
                let j1 = self.insts.len();
                self.compile(*x);
                let split = self.empty_split();
                let j2 = self.insts.len();

                if g.is_greedy() {
                    self.set_split(split, j1, j2);
                } else {
                    self.set_split(split, j2, j1);
                }
            }
        }
    }

    /// Appends the given instruction to the program.
    #[inline]
    fn push(&mut self, x: Inst) {
        self.insts.push(x)
    }

    /// Appends an *empty* `Split` instruction to the program and returns
    /// the index of that instruction. (The index can then be used to "patch"
    /// the actual locations of the split in later.)
    #[inline]
    fn empty_split(&mut self) -> InstIdx {
        self.insts.push(Split(0, 0));
        self.insts.len() - 1
    }

    /// Sets the left and right locations of a `Split` instruction at index
    /// `i` to `pc1` and `pc2`, respectively.
    /// If the instruction at index `i` isn't a `Split` instruction, then
    /// `panic!` is called.
    #[inline]
    fn set_split(&mut self, i: InstIdx, pc1: InstIdx, pc2: InstIdx) {
        let split = &mut self.insts[i];
        match *split {
            Split(_, _) => *split = Split(pc1, pc2),
            _ => panic!("BUG: Invalid split index."),
        }
    }

    /// Appends an *empty* `Jump` instruction to the program and returns the
    /// index of that instruction.
    #[inline]
    fn empty_jump(&mut self) -> InstIdx {
        self.insts.push(Jump(0));
        self.insts.len() - 1
    }

    /// Sets the location of a `Jump` instruction at index `i` to `pc`.
    /// If the instruction at index `i` isn't a `Jump` instruction, then
    /// `panic!` is called.
    #[inline]
    fn set_jump(&mut self, i: InstIdx, pc: InstIdx) {
        let jmp = &mut self.insts[i];
        match *jmp {
            Jump(_) => *jmp = Jump(pc),
            _ => panic!("BUG: Invalid jump index."),
        }
    }
}