Parent Directory
|
Revision Log
Upgrade to SpiderMonkey from Firefox 3.5.3.
1 | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
2 | * vim: set sw=4 ts=8 et tw=78: |
3 | * |
4 | * ***** BEGIN LICENSE BLOCK ***** |
5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
6 | * |
7 | * The contents of this file are subject to the Mozilla Public License Version |
8 | * 1.1 (the "License"); you may not use this file except in compliance with |
9 | * the License. You may obtain a copy of the License at |
10 | * http://www.mozilla.org/MPL/ |
11 | * |
12 | * Software distributed under the License is distributed on an "AS IS" basis, |
13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
14 | * for the specific language governing rights and limitations under the |
15 | * License. |
16 | * |
17 | * The Original Code is Mozilla Communicator client code, released |
18 | * March 31, 1998. |
19 | * |
20 | * The Initial Developer of the Original Code is |
21 | * Netscape Communications Corporation. |
22 | * Portions created by the Initial Developer are Copyright (C) 1998 |
23 | * the Initial Developer. All Rights Reserved. |
24 | * |
25 | * Contributor(s): |
26 | * |
27 | * Alternatively, the contents of this file may be used under the terms of |
28 | * either of the GNU General Public License Version 2 or later (the "GPL"), |
29 | * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
30 | * in which case the provisions of the GPL or the LGPL are applicable instead |
31 | * of those above. If you wish to allow use of your version of this file only |
32 | * under the terms of either the GPL or the LGPL, and not to allow others to |
33 | * use your version of this file under the terms of the MPL, indicate your |
34 | * decision by deleting the provisions above and replace them with the notice |
35 | * and other provisions required by the GPL or the LGPL. If you do not delete |
36 | * the provisions above, a recipient may use your version of this file under |
37 | * the terms of any one of the MPL, the GPL or the LGPL. |
38 | * |
39 | * ***** END LICENSE BLOCK ***** */ |
40 | |
41 | /* |
42 | * JS regular expressions, after Perl. |
43 | */ |
44 | #include "jsstddef.h" |
45 | #include <stdlib.h> |
46 | #include <string.h> |
47 | #include <stdarg.h> |
48 | #include "jstypes.h" |
49 | #include "jsarena.h" /* Added by JSIFY */ |
50 | #include "jsutil.h" /* Added by JSIFY */ |
51 | #include "jsapi.h" |
52 | #include "jsarray.h" |
53 | #include "jsatom.h" |
54 | #include "jsbuiltins.h" |
55 | #include "jscntxt.h" |
56 | #include "jsversion.h" |
57 | #include "jsfun.h" |
58 | #include "jsgc.h" |
59 | #include "jsinterp.h" |
60 | #include "jslock.h" |
61 | #include "jsnum.h" |
62 | #include "jsobj.h" |
63 | #include "jsopcode.h" |
64 | #include "jsregexp.h" |
65 | #include "jsscan.h" |
66 | #include "jsscope.h" |
67 | #include "jsstaticcheck.h" |
68 | #include "jsstr.h" |
69 | |
70 | #ifdef JS_TRACER |
71 | #include "jstracer.h" |
72 | using namespace avmplus; |
73 | using namespace nanojit; |
74 | #endif |
75 | |
76 | typedef enum REOp { |
77 | #define REOP_DEF(opcode, name) opcode, |
78 | #include "jsreops.tbl" |
79 | #undef REOP_DEF |
80 | REOP_LIMIT /* META: no operator >= to this */ |
81 | } REOp; |
82 | |
83 | #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS) |
84 | |
85 | #ifdef REGEXP_DEBUG |
86 | const char *reop_names[] = { |
87 | #define REOP_DEF(opcode, name) name, |
88 | #include "jsreops.tbl" |
89 | #undef REOP_DEF |
90 | NULL |
91 | }; |
92 | #endif |
93 | |
94 | #ifdef __GNUC__ |
95 | static int |
96 | re_debug(const char *fmt, ...) __attribute__ ((format(printf, 1, 2))); |
97 | #endif |
98 | |
99 | #ifdef REGEXP_DEBUG |
100 | static int |
101 | re_debug(const char *fmt, ...) |
102 | { |
103 | va_list ap; |
104 | int retval; |
105 | |
106 | va_start(ap, fmt); |
107 | retval = vprintf(fmt, ap); |
108 | va_end(ap); |
109 | return retval; |
110 | } |
111 | |
112 | static void |
113 | re_debug_chars(const jschar *chrs, size_t length) |
114 | { |
115 | int i = 0; |
116 | |
117 | printf(" \""); |
118 | while (*chrs && i++ < length) { |
119 | putchar((char)*chrs++); |
120 | } |
121 | printf("\""); |
122 | } |
123 | #else /* !REGEXP_DEBUG */ |
124 | /* This should be optimized to a no-op by our tier-1 compilers. */ |
125 | static int |
126 | re_debug(const char *fmt, ...) |
127 | { |
128 | return 0; |
129 | } |
130 | |
131 | static void |
132 | re_debug_chars(const jschar *chrs, size_t length) |
133 | { |
134 | } |
135 | #endif /* !REGEXP_DEBUG */ |
136 | |
137 | struct RENode { |
138 | REOp op; /* r.e. op bytecode */ |
139 | RENode *next; /* next in concatenation order */ |
140 | void *kid; /* first operand */ |
141 | union { |
142 | void *kid2; /* second operand */ |
143 | jsint num; /* could be a number */ |
144 | size_t parenIndex; /* or a parenthesis index */ |
145 | struct { /* or a quantifier range */ |
146 | uintN min; |
147 | uintN max; |
148 | JSPackedBool greedy; |
149 | } range; |
150 | struct { /* or a character class */ |
151 | size_t startIndex; |
152 | size_t kidlen; /* length of string at kid, in jschars */ |
153 | size_t index; /* index into class list */ |
154 | uint16 bmsize; /* bitmap size, based on max char code */ |
155 | JSPackedBool sense; |
156 | } ucclass; |
157 | struct { /* or a literal sequence */ |
158 | jschar chr; /* of one character */ |
159 | size_t length; /* or many (via the kid) */ |
160 | } flat; |
161 | struct { |
162 | RENode *kid2; /* second operand from ALT */ |
163 | jschar ch1; /* match char for ALTPREREQ */ |
164 | jschar ch2; /* ditto, or class index for ALTPREREQ2 */ |
165 | } altprereq; |
166 | } u; |
167 | }; |
168 | |
169 | #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \ |
170 | ((c >= 'a') && (c <= 'z')) ) |
171 | #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \ |
172 | (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR)) |
173 | |
174 | #define CLASS_CACHE_SIZE 4 |
175 | |
176 | typedef struct CompilerState { |
177 | JSContext *context; |
178 | JSTokenStream *tokenStream; /* For reporting errors */ |
179 | const jschar *cpbegin; |
180 | const jschar *cpend; |
181 | const jschar *cp; |
182 | size_t parenCount; |
183 | size_t classCount; /* number of [] encountered */ |
184 | size_t treeDepth; /* maximum depth of parse tree */ |
185 | size_t progLength; /* estimated bytecode length */ |
186 | RENode *result; |
187 | size_t classBitmapsMem; /* memory to hold all class bitmaps */ |
188 | struct { |
189 | const jschar *start; /* small cache of class strings */ |
190 | size_t length; /* since they're often the same */ |
191 | size_t index; |
192 | } classCache[CLASS_CACHE_SIZE]; |
193 | uint16 flags; |
194 | } CompilerState; |
195 | |
196 | typedef struct EmitStateStackEntry { |
197 | jsbytecode *altHead; /* start of REOP_ALT* opcode */ |
198 | jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */ |
199 | jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */ |
200 | jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */ |
201 | RENode *continueNode; /* original REOP_ALT* node being stacked */ |
202 | jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */ |
203 | JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to |
204 | avoid 16-bit unsigned offset overflow */ |
205 | } EmitStateStackEntry; |
206 | |
207 | /* |
208 | * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h, |
209 | * the getters and setters take the pc of the offset, not of the opcode before |
210 | * the offset. |
211 | */ |
212 | #define ARG_LEN 2 |
213 | #define GET_ARG(pc) ((uint16)(((pc)[0] << 8) | (pc)[1])) |
214 | #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \ |
215 | (pc)[1] = (jsbytecode) (arg)) |
216 | |
217 | #define OFFSET_LEN ARG_LEN |
218 | #define OFFSET_MAX (JS_BIT(ARG_LEN * 8) - 1) |
219 | #define GET_OFFSET(pc) GET_ARG(pc) |
220 | |
221 | /* |
222 | * Maximum supported tree depth is maximum size of EmitStateStackEntry stack. |
223 | * For sanity, we limit it to 2^24 bytes. |
224 | */ |
225 | #define TREE_DEPTH_MAX (JS_BIT(24) / sizeof(EmitStateStackEntry)) |
226 | |
227 | /* |
228 | * The maximum memory that can be allocated for class bitmaps. |
229 | * For sanity, we limit it to 2^24 bytes. |
230 | */ |
231 | #define CLASS_BITMAPS_MEM_LIMIT JS_BIT(24) |
232 | |
233 | /* |
234 | * Functions to get size and write/read bytecode that represent small indexes |
235 | * compactly. |
236 | * Each byte in the code represent 7-bit chunk of the index. 8th bit when set |
237 | * indicates that the following byte brings more bits to the index. Otherwise |
238 | * this is the last byte in the index bytecode representing highest index bits. |
239 | */ |
240 | static size_t |
241 | GetCompactIndexWidth(size_t index) |
242 | { |
243 | size_t width; |
244 | |
245 | for (width = 1; (index >>= 7) != 0; ++width) { } |
246 | return width; |
247 | } |
248 | |
249 | static JS_ALWAYS_INLINE jsbytecode * |
250 | WriteCompactIndex(jsbytecode *pc, size_t index) |
251 | { |
252 | size_t next; |
253 | |
254 | while ((next = index >> 7) != 0) { |
255 | *pc++ = (jsbytecode)(index | 0x80); |
256 | index = next; |
257 | } |
258 | *pc++ = (jsbytecode)index; |
259 | return pc; |
260 | } |
261 | |
262 | static JS_ALWAYS_INLINE jsbytecode * |
263 | ReadCompactIndex(jsbytecode *pc, size_t *result) |
264 | { |
265 | size_t nextByte; |
266 | |
267 | nextByte = *pc++; |
268 | if ((nextByte & 0x80) == 0) { |
269 | /* |
270 | * Short-circuit the most common case when compact index <= 127. |
271 | */ |
272 | *result = nextByte; |
273 | } else { |
274 | size_t shift = 7; |
275 | *result = 0x7F & nextByte; |
276 | do { |
277 | nextByte = *pc++; |
278 | *result |= (nextByte & 0x7F) << shift; |
279 | shift += 7; |
280 | } while ((nextByte & 0x80) != 0); |
281 | } |
282 | return pc; |
283 | } |
284 | |
285 | typedef struct RECapture { |
286 | ptrdiff_t index; /* start of contents, -1 for empty */ |
287 | size_t length; /* length of capture */ |
288 | } RECapture; |
289 | |
290 | typedef struct REMatchState { |
291 | const jschar *cp; |
292 | RECapture parens[1]; /* first of 're->parenCount' captures, |
293 | allocated at end of this struct */ |
294 | } REMatchState; |
295 | |
296 | struct REBackTrackData; |
297 | |
298 | typedef struct REProgState { |
299 | jsbytecode *continue_pc; /* current continuation data */ |
300 | jsbytecode continue_op; |
301 | ptrdiff_t index; /* progress in text */ |
302 | size_t parenSoFar; /* highest indexed paren started */ |
303 | union { |
304 | struct { |
305 | uintN min; /* current quantifier limits */ |
306 | uintN max; |
307 | } quantifier; |
308 | struct { |
309 | size_t top; /* backtrack stack state */ |
310 | size_t sz; |
311 | } assertion; |
312 | } u; |
313 | } REProgState; |
314 | |
315 | typedef struct REBackTrackData { |
316 | size_t sz; /* size of previous stack entry */ |
317 | jsbytecode *backtrack_pc; /* where to backtrack to */ |
318 | jsbytecode backtrack_op; |
319 | const jschar *cp; /* index in text of match at backtrack */ |
320 | size_t parenIndex; /* start index of saved paren contents */ |
321 | size_t parenCount; /* # of saved paren contents */ |
322 | size_t saveStateStackTop; /* number of parent states */ |
323 | /* saved parent states follow */ |
324 | /* saved paren contents follow */ |
325 | } REBackTrackData; |
326 | |
327 | #define INITIAL_STATESTACK 100 |
328 | #define INITIAL_BACKTRACK 8000 |
329 | |
330 | typedef struct REGlobalData { |
331 | JSContext *cx; |
332 | JSRegExp *regexp; /* the RE in execution */ |
333 | JSBool ok; /* runtime error (out_of_memory only?) */ |
334 | size_t start; /* offset to start at */ |
335 | ptrdiff_t skipped; /* chars skipped anchoring this r.e. */ |
336 | const jschar *cpbegin; /* text base address */ |
337 | const jschar *cpend; /* text limit address */ |
338 | |
339 | REProgState *stateStack; /* stack of state of current parents */ |
340 | size_t stateStackTop; |
341 | size_t stateStackLimit; |
342 | |
343 | REBackTrackData *backTrackStack;/* stack of matched-so-far positions */ |
344 | REBackTrackData *backTrackSP; |
345 | size_t backTrackStackSize; |
346 | size_t cursz; /* size of current stack entry */ |
347 | size_t backTrackCount; /* how many times we've backtracked */ |
348 | size_t backTrackLimit; /* upper limit on backtrack states */ |
349 | } REGlobalData; |
350 | |
351 | /* |
352 | * 1. If IgnoreCase is false, return ch. |
353 | * 2. Let u be ch converted to upper case as if by calling |
354 | * String.prototype.toUpperCase on the one-character string ch. |
355 | * 3. If u does not consist of a single character, return ch. |
356 | * 4. Let cu be u's character. |
357 | * 5. If ch's code point value is greater than or equal to decimal 128 and cu's |
358 | * code point value is less than decimal 128, then return ch. |
359 | * 6. Return cu. |
360 | */ |
361 | static JS_ALWAYS_INLINE uintN |
362 | upcase(uintN ch) |
363 | { |
364 | uintN cu; |
365 | |
366 | JS_ASSERT((uintN) (jschar) ch == ch); |
367 | if (ch < 128) { |
368 | if (ch - (uintN) 'a' <= (uintN) ('z' - 'a')) |
369 | ch -= (uintN) ('a' - 'A'); |
370 | return ch; |
371 | } |
372 | |
373 | cu = JS_TOUPPER(ch); |
374 | return (cu < 128) ? ch : cu; |
375 | } |
376 | |
377 | static JS_ALWAYS_INLINE uintN |
378 | downcase(uintN ch) |
379 | { |
380 | JS_ASSERT((uintN) (jschar) ch == ch); |
381 | if (ch < 128) { |
382 | if (ch - (uintN) 'A' <= (uintN) ('Z' - 'A')) |
383 | ch += (uintN) ('a' - 'A'); |
384 | return ch; |
385 | } |
386 | |
387 | return JS_TOLOWER(ch); |
388 | } |
389 | |
390 | /* Construct and initialize an RENode, returning NULL for out-of-memory */ |
391 | static RENode * |
392 | NewRENode(CompilerState *state, REOp op) |
393 | { |
394 | JSContext *cx; |
395 | RENode *ren; |
396 | |
397 | cx = state->context; |
398 | JS_ARENA_ALLOCATE_CAST(ren, RENode *, &cx->tempPool, sizeof *ren); |
399 | if (!ren) { |
400 | js_ReportOutOfScriptQuota(cx); |
401 | return NULL; |
402 | } |
403 | ren->op = op; |
404 | ren->next = NULL; |
405 | ren->kid = NULL; |
406 | return ren; |
407 | } |
408 | |
409 | /* |
410 | * Validates and converts hex ascii value. |
411 | */ |
412 | static JSBool |
413 | isASCIIHexDigit(jschar c, uintN *digit) |
414 | { |
415 | uintN cv = c; |
416 | |
417 | if (cv < '0') |
418 | return JS_FALSE; |
419 | if (cv <= '9') { |
420 | *digit = cv - '0'; |
421 | return JS_TRUE; |
422 | } |
423 | cv |= 0x20; |
424 | if (cv >= 'a' && cv <= 'f') { |
425 | *digit = cv - 'a' + 10; |
426 | return JS_TRUE; |
427 | } |
428 | return JS_FALSE; |
429 | } |
430 | |
431 | |
432 | typedef struct { |
433 | REOp op; |
434 | const jschar *errPos; |
435 | size_t parenIndex; |
436 | } REOpData; |
437 | |
438 | static JSBool |
439 | ReportRegExpErrorHelper(CompilerState *state, uintN flags, uintN errorNumber, |
440 | const jschar *arg) |
441 | { |
442 | if (state->tokenStream) { |
443 | return js_ReportCompileErrorNumber(state->context, state->tokenStream, |
444 | NULL, JSREPORT_UC | flags, |
445 | errorNumber, arg); |
446 | } |
447 | return JS_ReportErrorFlagsAndNumberUC(state->context, flags, |
448 | js_GetErrorMessage, NULL, |
449 | errorNumber, arg); |
450 | } |
451 | |
452 | static JSBool |
453 | ReportRegExpError(CompilerState *state, uintN flags, uintN errorNumber) |
454 | { |
455 | return ReportRegExpErrorHelper(state, flags, errorNumber, NULL); |
456 | } |
457 | |
458 | /* |
459 | * Process the op against the two top operands, reducing them to a single |
460 | * operand in the penultimate slot. Update progLength and treeDepth. |
461 | */ |
462 | static JSBool |
463 | ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack, |
464 | intN operandSP) |
465 | { |
466 | RENode *result; |
467 | |
468 | switch (opData->op) { |
469 | case REOP_ALT: |
470 | result = NewRENode(state, REOP_ALT); |
471 | if (!result) |
472 | return JS_FALSE; |
473 | result->kid = operandStack[operandSP - 2]; |
474 | result->u.kid2 = operandStack[operandSP - 1]; |
475 | operandStack[operandSP - 2] = result; |
476 | |
477 | if (state->treeDepth == TREE_DEPTH_MAX) { |
478 | ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX); |
479 | return JS_FALSE; |
480 | } |
481 | ++state->treeDepth; |
482 | |
483 | /* |
484 | * Look at both alternates to see if there's a FLAT or a CLASS at |
485 | * the start of each. If so, use a prerequisite match. |
486 | */ |
487 | if (((RENode *) result->kid)->op == REOP_FLAT && |
488 | ((RENode *) result->u.kid2)->op == REOP_FLAT && |
489 | (state->flags & JSREG_FOLD) == 0) { |
490 | result->op = REOP_ALTPREREQ; |
491 | result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr; |
492 | result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr; |
493 | /* ALTPREREQ, <end>, uch1, uch2, <next>, ..., |
494 | JUMP, <end> ... ENDALT */ |
495 | state->progLength += 13; |
496 | } |
497 | else |
498 | if (((RENode *) result->kid)->op == REOP_CLASS && |
499 | ((RENode *) result->kid)->u.ucclass.index < 256 && |
500 | ((RENode *) result->u.kid2)->op == REOP_FLAT && |
501 | (state->flags & JSREG_FOLD) == 0) { |
502 | result->op = REOP_ALTPREREQ2; |
503 | result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr; |
504 | result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index; |
505 | /* ALTPREREQ2, <end>, uch1, uch2, <next>, ..., |
506 | JUMP, <end> ... ENDALT */ |
507 | state->progLength += 13; |
508 | } |
509 | else |
510 | if (((RENode *) result->kid)->op == REOP_FLAT && |
511 | ((RENode *) result->u.kid2)->op == REOP_CLASS && |
512 | ((RENode *) result->u.kid2)->u.ucclass.index < 256 && |
513 | (state->flags & JSREG_FOLD) == 0) { |
514 | result->op = REOP_ALTPREREQ2; |
515 | result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr; |
516 | result->u.altprereq.ch2 = |
517 | ((RENode *) result->u.kid2)->u.ucclass.index; |
518 | /* ALTPREREQ2, <end>, uch1, uch2, <next>, ..., |
519 | JUMP, <end> ... ENDALT */ |
520 | state->progLength += 13; |
521 | } |
522 | else { |
523 | /* ALT, <next>, ..., JUMP, <end> ... ENDALT */ |
524 | state->progLength += 7; |
525 | } |
526 | break; |
527 | |
528 | case REOP_CONCAT: |
529 | result = operandStack[operandSP - 2]; |
530 | while (result->next) |
531 | result = result->next; |
532 | result->next = operandStack[operandSP - 1]; |
533 | break; |
534 | |
535 | case REOP_ASSERT: |
536 | case REOP_ASSERT_NOT: |
537 | case REOP_LPARENNON: |
538 | case REOP_LPAREN: |
539 | /* These should have been processed by a close paren. */ |
540 | ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN, |
541 | opData->errPos); |
542 | return JS_FALSE; |
543 | |
544 | default:; |
545 | } |
546 | return JS_TRUE; |
547 | } |
548 | |
549 | /* |
550 | * Parser forward declarations. |
551 | */ |
552 | static JSBool ParseTerm(CompilerState *state); |
553 | static JSBool ParseQuantifier(CompilerState *state); |
554 | static intN ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues); |
555 | |
556 | /* |
557 | * Top-down regular expression grammar, based closely on Perl4. |
558 | * |
559 | * regexp: altern A regular expression is one or more |
560 | * altern '|' regexp alternatives separated by vertical bar. |
561 | */ |
562 | #define INITIAL_STACK_SIZE 128 |
563 | |
564 | static JSBool |
565 | ParseRegExp(CompilerState *state) |
566 | { |
567 | size_t parenIndex; |
568 | RENode *operand; |
569 | REOpData *operatorStack; |
570 | RENode **operandStack; |
571 | REOp op; |
572 | intN i; |
573 | JSBool result = JS_FALSE; |
574 | |
575 | intN operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE; |
576 | intN operandSP = 0, operandStackSize = INITIAL_STACK_SIZE; |
577 | |
578 | /* Watch out for empty regexp */ |
579 | if (state->cp == state->cpend) { |
580 | state->result = NewRENode(state, REOP_EMPTY); |
581 | return (state->result != NULL); |
582 | } |
583 | |
584 | operatorStack = (REOpData *) |
585 | JS_malloc(state->context, sizeof(REOpData) * operatorStackSize); |
586 | if (!operatorStack) |
587 | return JS_FALSE; |
588 | |
589 | operandStack = (RENode **) |
590 | JS_malloc(state->context, sizeof(RENode *) * operandStackSize); |
591 | if (!operandStack) |
592 | goto out; |
593 | |
594 | for (;;) { |
595 | parenIndex = state->parenCount; |
596 | if (state->cp == state->cpend) { |
597 | /* |
598 | * If we are at the end of the regexp and we're short one or more |
599 | * operands, the regexp must have the form /x|/ or some such, with |
600 | * left parentheses making us short more than one operand. |
601 | */ |
602 | if (operatorSP >= operandSP) { |
603 | operand = NewRENode(state, REOP_EMPTY); |
604 | if (!operand) |
605 | goto out; |
606 | goto pushOperand; |
607 | } |
608 | } else { |
609 | switch (*state->cp) { |
610 | case '(': |
611 | ++state->cp; |
612 | if (state->cp + 1 < state->cpend && |
613 | *state->cp == '?' && |
614 | (state->cp[1] == '=' || |
615 | state->cp[1] == '!' || |
616 | state->cp[1] == ':')) { |
617 | switch (state->cp[1]) { |
618 | case '=': |
619 | op = REOP_ASSERT; |
620 | /* ASSERT, <next>, ... ASSERTTEST */ |
621 | state->progLength += 4; |
622 | break; |
623 | case '!': |
624 | op = REOP_ASSERT_NOT; |
625 | /* ASSERTNOT, <next>, ... ASSERTNOTTEST */ |
626 | state->progLength += 4; |
627 | break; |
628 | default: |
629 | op = REOP_LPARENNON; |
630 | break; |
631 | } |
632 | state->cp += 2; |
633 | } else { |
634 | op = REOP_LPAREN; |
635 | /* LPAREN, <index>, ... RPAREN, <index> */ |
636 | state->progLength |
637 | += 2 * (1 + GetCompactIndexWidth(parenIndex)); |
638 | state->parenCount++; |
639 | if (state->parenCount == 65535) { |
640 | ReportRegExpError(state, JSREPORT_ERROR, |
641 | JSMSG_TOO_MANY_PARENS); |
642 | goto out; |
643 | } |
644 | } |
645 | goto pushOperator; |
646 | |
647 | case ')': |
648 | /* |
649 | * If there's no stacked open parenthesis, throw syntax error. |
650 | */ |
651 | for (i = operatorSP - 1; ; i--) { |
652 | if (i < 0) { |
653 | ReportRegExpError(state, JSREPORT_ERROR, |
654 | JSMSG_UNMATCHED_RIGHT_PAREN); |
655 | goto out; |
656 | } |
657 | if (operatorStack[i].op == REOP_ASSERT || |
658 | operatorStack[i].op == REOP_ASSERT_NOT || |
659 | operatorStack[i].op == REOP_LPARENNON || |
660 | operatorStack[i].op == REOP_LPAREN) { |
661 | break; |
662 | } |
663 | } |
664 | /* FALL THROUGH */ |
665 | |
666 | case '|': |
667 | /* Expected an operand before these, so make an empty one */ |
668 | operand = NewRENode(state, REOP_EMPTY); |
669 | if (!operand) |
670 | goto out; |
671 | goto pushOperand; |
672 | |
673 | default: |
674 | if (!ParseTerm(state)) |
675 | goto out; |
676 | operand = state->result; |
677 | pushOperand: |
678 | if (operandSP == operandStackSize) { |
679 | RENode **tmp; |
680 | operandStackSize += operandStackSize; |
681 | tmp = (RENode **) |
682 | JS_realloc(state->context, operandStack, |
683 | sizeof(RENode *) * operandStackSize); |
684 | if (!tmp) |
685 | goto out; |
686 | operandStack = tmp; |
687 | } |
688 | operandStack[operandSP++] = operand; |
689 | break; |
690 | } |
691 | } |
692 | |
693 | /* At the end; process remaining operators. */ |
694 | restartOperator: |
695 | if (state->cp == state->cpend) { |
696 | while (operatorSP) { |
697 | --operatorSP; |
698 | if (!ProcessOp(state, &operatorStack[operatorSP], |
699 | operandStack, operandSP)) |
700 | goto out; |
701 | --operandSP; |
702 | } |
703 | JS_ASSERT(operandSP == 1); |
704 | state->result = operandStack[0]; |
705 | result = JS_TRUE; |
706 | goto out; |
707 | } |
708 | |
709 | switch (*state->cp) { |
710 | case '|': |
711 | /* Process any stacked 'concat' operators */ |
712 | ++state->cp; |
713 | while (operatorSP && |
714 | operatorStack[operatorSP - 1].op == REOP_CONCAT) { |
715 | --operatorSP; |
716 | if (!ProcessOp(state, &operatorStack[operatorSP], |
717 | operandStack, operandSP)) { |
718 | goto out; |
719 | } |
720 | --operandSP; |
721 | } |
722 | op = REOP_ALT; |
723 | goto pushOperator; |
724 | |
725 | case ')': |
726 | /* |
727 | * If there's no stacked open parenthesis, throw syntax error. |
728 | */ |
729 | for (i = operatorSP - 1; ; i--) { |
730 | if (i < 0) { |
731 | ReportRegExpError(state, JSREPORT_ERROR, |
732 | JSMSG_UNMATCHED_RIGHT_PAREN); |
733 | goto out; |
734 | } |
735 | if (operatorStack[i].op == REOP_ASSERT || |
736 | operatorStack[i].op == REOP_ASSERT_NOT || |
737 | operatorStack[i].op == REOP_LPARENNON || |
738 | operatorStack[i].op == REOP_LPAREN) { |
739 | break; |
740 | } |
741 | } |
742 | ++state->cp; |
743 | |
744 | /* Process everything on the stack until the open parenthesis. */ |
745 | for (;;) { |
746 | JS_ASSERT(operatorSP); |
747 | --operatorSP; |
748 | switch (operatorStack[operatorSP].op) { |
749 | case REOP_ASSERT: |
750 | case REOP_ASSERT_NOT: |
751 | case REOP_LPAREN: |
752 | operand = NewRENode(state, operatorStack[operatorSP].op); |
753 | if (!operand) |
754 | goto out; |
755 | operand->u.parenIndex = |
756 | operatorStack[operatorSP].parenIndex; |
757 | JS_ASSERT(operandSP); |
758 | operand->kid = operandStack[operandSP - 1]; |
759 | operandStack[operandSP - 1] = operand; |
760 | if (state->treeDepth == TREE_DEPTH_MAX) { |
761 | ReportRegExpError(state, JSREPORT_ERROR, |
762 | JSMSG_REGEXP_TOO_COMPLEX); |
763 | goto out; |
764 | } |
765 | ++state->treeDepth; |
766 | /* FALL THROUGH */ |
767 | |
768 | case REOP_LPARENNON: |
769 | state->result = operandStack[operandSP - 1]; |
770 | if (!ParseQuantifier(state)) |
771 | goto out; |
772 | operandStack[operandSP - 1] = state->result; |
773 | goto restartOperator; |
774 | default: |
775 | if (!ProcessOp(state, &operatorStack[operatorSP], |
776 | operandStack, operandSP)) |
777 | goto out; |
778 | --operandSP; |
779 | break; |
780 | } |
781 | } |
782 | break; |
783 | |
784 | case '{': |
785 | { |
786 | const jschar *errp = state->cp; |
787 | |
788 | if (ParseMinMaxQuantifier(state, JS_TRUE) < 0) { |
789 | /* |
790 | * This didn't even scan correctly as a quantifier, so we should |
791 | * treat it as flat. |
792 | */ |
793 | op = REOP_CONCAT; |
794 | goto pushOperator; |
795 | } |
796 | |
797 | state->cp = errp; |
798 | /* FALL THROUGH */ |
799 | } |
800 | |
801 | case '+': |
802 | case '*': |
803 | case '?': |
804 | ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER, |
805 | state->cp); |
806 | result = JS_FALSE; |
807 | goto out; |
808 | |
809 | default: |
810 | /* Anything else is the start of the next term. */ |
811 | op = REOP_CONCAT; |
812 | pushOperator: |
813 | if (operatorSP == operatorStackSize) { |
814 | REOpData *tmp; |
815 | operatorStackSize += operatorStackSize; |
816 | tmp = (REOpData *) |
817 | JS_realloc(state->context, operatorStack, |
818 | sizeof(REOpData) * operatorStackSize); |
819 | if (!tmp) |
820 | goto out; |
821 | operatorStack = tmp; |
822 | } |
823 | operatorStack[operatorSP].op = op; |
824 | operatorStack[operatorSP].errPos = state->cp; |
825 | operatorStack[operatorSP++].parenIndex = parenIndex; |
826 | break; |
827 | } |
828 | } |
829 | out: |
830 | if (operatorStack) |
831 | JS_free(state->context, operatorStack); |
832 | if (operandStack) |
833 | JS_free(state->context, operandStack); |
834 | return result; |
835 | } |
836 | |
837 | /* |
838 | * Hack two bits in CompilerState.flags, for use within FindParenCount to flag |
839 | * its being on the stack, and to propagate errors to its callers. |
840 | */ |
841 | #define JSREG_FIND_PAREN_COUNT 0x8000 |
842 | #define JSREG_FIND_PAREN_ERROR 0x4000 |
843 | |
844 | /* |
845 | * Magic return value from FindParenCount and GetDecimalValue, to indicate |
846 | * overflow beyond GetDecimalValue's max parameter, or a computed maximum if |
847 | * its findMax parameter is non-null. |
848 | */ |
849 | #define OVERFLOW_VALUE ((uintN)-1) |
850 | |
851 | static uintN |
852 | FindParenCount(CompilerState *state) |
853 | { |
854 | CompilerState temp; |
855 | int i; |
856 | |
857 | if (state->flags & JSREG_FIND_PAREN_COUNT) |
858 | return OVERFLOW_VALUE; |
859 | |
860 | /* |
861 | * Copy state into temp, flag it so we never report an invalid backref, |
862 | * and reset its members to parse the entire regexp. This is obviously |
863 | * suboptimal, but GetDecimalValue calls us only if a backref appears to |
864 | * refer to a forward parenthetical, which is rare. |
865 | */ |
866 | temp = *state; |
867 | temp.flags |= JSREG_FIND_PAREN_COUNT; |
868 | temp.cp = temp.cpbegin; |
869 | temp.parenCount = 0; |
870 | temp.classCount = 0; |
871 | temp.progLength = 0; |
872 | temp.treeDepth = 0; |
873 | temp.classBitmapsMem = 0; |
874 | for (i = 0; i < CLASS_CACHE_SIZE; i++) |
875 | temp.classCache[i].start = NULL; |
876 | |
877 | if (!ParseRegExp(&temp)) { |
878 | state->flags |= JSREG_FIND_PAREN_ERROR; |
879 | return OVERFLOW_VALUE; |
880 | } |
881 | return temp.parenCount; |
882 | } |
883 | |
884 | /* |
885 | * Extract and return a decimal value at state->cp. The initial character c |
886 | * has already been read. Return OVERFLOW_VALUE if the result exceeds max. |
887 | * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in |
888 | * state->flags to discover whether an error occurred under findMax. |
889 | */ |
890 | static uintN |
891 | GetDecimalValue(jschar c, uintN max, uintN (*findMax)(CompilerState *state), |
892 | CompilerState *state) |
893 | { |
894 | uintN value = JS7_UNDEC(c); |
895 | JSBool overflow = (value > max && (!findMax || value > findMax(state))); |
896 | |
897 | /* The following restriction allows simpler overflow checks. */ |
898 | JS_ASSERT(max <= ((uintN)-1 - 9) / 10); |
899 | while (state->cp < state->cpend) { |
900 | c = *state->cp; |
901 | if (!JS7_ISDEC(c)) |
902 | break; |
903 | value = 10 * value + JS7_UNDEC(c); |
904 | if (!overflow && value > max && (!findMax || value > findMax(state))) |
905 | overflow = JS_TRUE; |
906 | ++state->cp; |
907 | } |
908 | return overflow ? OVERFLOW_VALUE : value; |
909 | } |
910 | |
911 | /* |
912 | * Calculate the total size of the bitmap required for a class expression. |
913 | */ |
914 | static JSBool |
915 | CalculateBitmapSize(CompilerState *state, RENode *target, const jschar *src, |
916 | const jschar *end) |
917 | { |
918 | uintN max = 0; |
919 | JSBool inRange = JS_FALSE; |
920 | jschar c, rangeStart = 0; |
921 | uintN n, digit, nDigits, i; |
922 | |
923 | target->u.ucclass.bmsize = 0; |
924 | target->u.ucclass.sense = JS_TRUE; |
925 | |
926 | if (src == end) |
927 | return JS_TRUE; |
928 | |
929 | if (*src == '^') { |
930 | ++src; |
931 | target->u.ucclass.sense = JS_FALSE; |
932 | } |
933 | |
934 | while (src != end) { |
935 | JSBool canStartRange = JS_TRUE; |
936 | uintN localMax = 0; |
937 | |
938 | switch (*src) { |
939 | case '\\': |
940 | ++src; |
941 | c = *src++; |
942 | switch (c) { |
943 | case 'b': |
944 | localMax = 0x8; |
945 | break; |
946 | case 'f': |
947 | localMax = 0xC; |
948 | break; |
949 | case 'n': |
950 | localMax = 0xA; |
951 | break; |
952 | case 'r': |
953 | localMax = 0xD; |
954 | break; |
955 | case 't': |
956 | localMax = 0x9; |
957 | break; |
958 | case 'v': |
959 | localMax = 0xB; |
960 | break; |
961 | case 'c': |
962 | if (src < end && RE_IS_LETTER(*src)) { |
963 | localMax = (uintN) (*src++) & 0x1F; |
964 | } else { |
965 | --src; |
966 | localMax = '\\'; |
967 | } |
968 | break; |
969 | case 'x': |
970 | nDigits = 2; |
971 | goto lexHex; |
972 | case 'u': |
973 | nDigits = 4; |
974 | lexHex: |
975 | n = 0; |
976 | for (i = 0; (i < nDigits) && (src < end); i++) { |
977 | c = *src++; |
978 | if (!isASCIIHexDigit(c, &digit)) { |
979 | /* |
980 | * Back off to accepting the original |
981 | *'\' as a literal. |
982 | */ |
983 | src -= i + 1; |
984 | n = '\\'; |
985 | break; |
986 | } |
987 | n = (n << 4) | digit; |
988 | } |
989 | localMax = n; |
990 | break; |
991 | case 'd': |
992 | canStartRange = JS_FALSE; |
993 | if (inRange) { |
994 | JS_ReportErrorNumber(state->context, |
995 | js_GetErrorMessage, NULL, |
996 | JSMSG_BAD_CLASS_RANGE); |
997 | return JS_FALSE; |
998 | } |
999 | localMax = '9'; |
1000 | break; |
1001 | case 'D': |
1002 | case 's': |
1003 | case 'S': |
1004 | case 'w': |
1005 | case 'W': |
1006 | canStartRange = JS_FALSE; |
1007 | if (inRange) { |
1008 | JS_ReportErrorNumber(state->context, |
1009 | js_GetErrorMessage, NULL, |
1010 | JSMSG_BAD_CLASS_RANGE); |
1011 | return JS_FALSE; |
1012 | } |
1013 | max = 65535; |
1014 | |
1015 | /* |
1016 | * If this is the start of a range, ensure that it's less than |
1017 | * the end. |
1018 | */ |
1019 | localMax = 0; |
1020 | break; |
1021 | case '0': |
1022 | case '1': |
1023 | case '2': |
1024 | case '3': |
1025 | case '4': |
1026 | case '5': |
1027 | case '6': |
1028 | case '7': |
1029 | /* |
1030 | * This is a non-ECMA extension - decimal escapes (in this |
1031 | * case, octal!) are supposed to be an error inside class |
1032 | * ranges, but supported here for backwards compatibility. |
1033 | * |
1034 | */ |
1035 | n = JS7_UNDEC(c); |
1036 | c = *src; |
1037 | if ('0' <= c && c <= '7') { |
1038 | src++; |
1039 | n = 8 * n + JS7_UNDEC(c); |
1040 | c = *src; |
1041 | if ('0' <= c && c <= '7') { |
1042 | src++; |
1043 | i = 8 * n + JS7_UNDEC(c); |
1044 | if (i <= 0377) |
1045 | n = i; |
1046 | else |
1047 | src--; |
1048 | } |
1049 | } |
1050 | localMax = n; |
1051 | break; |
1052 | |
1053 | default: |
1054 | localMax = c; |
1055 | break; |
1056 | } |
1057 | break; |
1058 | default: |
1059 | localMax = *src++; |
1060 | break; |
1061 | } |
1062 | |
1063 | if (inRange) { |
1064 | /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */ |
1065 | if (rangeStart > localMax) { |
1066 | JS_ReportErrorNumber(state->context, |
1067 | js_GetErrorMessage, NULL, |
1068 | JSMSG_BAD_CLASS_RANGE); |
1069 | return JS_FALSE; |
1070 | } |
1071 | inRange = JS_FALSE; |
1072 | } else { |
1073 | if (canStartRange && src < end - 1) { |
1074 | if (*src == '-') { |
1075 | ++src; |
1076 | inRange = JS_TRUE; |
1077 | rangeStart = (jschar)localMax; |
1078 | continue; |
1079 | } |
1080 | } |
1081 | if (state->flags & JSREG_FOLD) |
1082 | rangeStart = localMax; /* one run of the uc/dc loop below */ |
1083 | } |
1084 | |
1085 | if (state->flags & JSREG_FOLD) { |
1086 | jschar maxch = localMax; |
1087 | |
1088 | for (i = rangeStart; i <= localMax; i++) { |
1089 | jschar uch, dch; |
1090 | |
1091 | uch = upcase(i); |
1092 | dch = downcase(i); |
1093 | maxch = JS_MAX(maxch, uch); |
1094 | maxch = JS_MAX(maxch, dch); |
1095 | } |
1096 | localMax = maxch; |
1097 | } |
1098 | |
1099 | if (localMax > max) |
1100 | max = localMax; |
1101 | } |
1102 | target->u.ucclass.bmsize = max; |
1103 | return JS_TRUE; |
1104 | } |
1105 | |
1106 | /* |
1107 | * item: assertion An item is either an assertion or |
1108 | * quantatom a quantified atom. |
1109 | * |
1110 | * assertion: '^' Assertions match beginning of string |
1111 | * (or line if the class static property |
1112 | * RegExp.multiline is true). |
1113 | * '$' End of string (or line if the class |
1114 | * static property RegExp.multiline is |
1115 | * true). |
1116 | * '\b' Word boundary (between \w and \W). |
1117 | * '\B' Word non-boundary. |
1118 | * |
1119 | * quantatom: atom An unquantified atom. |
1120 | * quantatom '{' n ',' m '}' |
1121 | * Atom must occur between n and m times. |
1122 | * quantatom '{' n ',' '}' Atom must occur at least n times. |
1123 | * quantatom '{' n '}' Atom must occur exactly n times. |
1124 | * quantatom '*' Zero or more times (same as {0,}). |
1125 | * quantatom '+' One or more times (same as {1,}). |
1126 | * quantatom '?' Zero or one time (same as {0,1}). |
1127 | * |
1128 | * any of which can be optionally followed by '?' for ungreedy |
1129 | * |
1130 | * atom: '(' regexp ')' A parenthesized regexp (what matched |
1131 | * can be addressed using a backreference, |
1132 | * see '\' n below). |
1133 | * '.' Matches any char except '\n'. |
1134 | * '[' classlist ']' A character class. |
1135 | * '[' '^' classlist ']' A negated character class. |
1136 | * '\f' Form Feed. |
1137 | * '\n' Newline (Line Feed). |
1138 | * '\r' Carriage Return. |
1139 | * '\t' Horizontal Tab. |
1140 | * '\v' Vertical Tab. |
1141 | * '\d' A digit (same as [0-9]). |
1142 | * '\D' A non-digit. |
1143 | * '\w' A word character, [0-9a-z_A-Z]. |
1144 | * '\W' A non-word character. |
1145 | * '\s' A whitespace character, [ \b\f\n\r\t\v]. |
1146 | * '\S' A non-whitespace character. |
1147 | * '\' n A backreference to the nth (n decimal |
1148 | * and positive) parenthesized expression. |
1149 | * '\' octal An octal escape sequence (octal must be |
1150 | * two or three digits long, unless it is |
1151 | * 0 for the null character). |
1152 | * '\x' hex A hex escape (hex must be two digits). |
1153 | * '\u' unicode A unicode escape (must be four digits). |
1154 | * '\c' ctrl A control character, ctrl is a letter. |
1155 | * '\' literalatomchar Any character except one of the above |
1156 | * that follow '\' in an atom. |
1157 | * otheratomchar Any character not first among the other |
1158 | * atom right-hand sides. |
1159 | */ |
1160 | static JSBool |
1161 | ParseTerm(CompilerState *state) |
1162 | { |
1163 | jschar c = *state->cp++; |
1164 | uintN nDigits; |
1165 | uintN num, tmp, n, i; |
1166 | const jschar *termStart; |
1167 | |
1168 | switch (c) { |
1169 | /* assertions and atoms */ |
1170 | case '^': |
1171 | state->result = NewRENode(state, REOP_BOL); |
1172 | if (!state->result) |
1173 | return JS_FALSE; |
1174 | state->progLength++; |
1175 | return JS_TRUE; |
1176 | case '$': |
1177 | state->result = NewRENode(state, REOP_EOL); |
1178 | if (!state->result) |
1179 | return JS_FALSE; |
1180 | state->progLength++; |
1181 | return JS_TRUE; |
1182 | case '\\': |
1183 | if (state->cp >= state->cpend) { |
1184 | /* a trailing '\' is an error */ |
1185 | ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH); |
1186 | return JS_FALSE; |
1187 | } |
1188 | c = *state->cp++; |
1189 | switch (c) { |
1190 | /* assertion escapes */ |
1191 | case 'b' : |
1192 | state->result = NewRENode(state, REOP_WBDRY); |
1193 | if (!state->result) |
1194 | return JS_FALSE; |
1195 | state->progLength++; |
1196 | return JS_TRUE; |
1197 | case 'B': |
1198 | state->result = NewRENode(state, REOP_WNONBDRY); |
1199 | if (!state->result) |
1200 | return JS_FALSE; |
1201 | state->progLength++; |
1202 | return JS_TRUE; |
1203 | /* Decimal escape */ |
1204 | case '0': |
1205 | /* Give a strict warning. See also the note below. */ |
1206 | if (!ReportRegExpError(state, JSREPORT_WARNING | JSREPORT_STRICT, |
1207 | JSMSG_INVALID_BACKREF)) { |
1208 | return JS_FALSE; |
1209 | } |
1210 | doOctal: |
1211 | num = 0; |
1212 | while (state->cp < state->cpend) { |
1213 | c = *state->cp; |
1214 | if (c < '0' || '7' < c) |
1215 | break; |
1216 | state->cp++; |
1217 | tmp = 8 * num + (uintN)JS7_UNDEC(c); |
1218 | if (tmp > 0377) |
1219 | break; |
1220 | num = tmp; |
1221 | } |
1222 | c = (jschar)num; |
1223 | doFlat: |
1224 | state->result = NewRENode(state, REOP_FLAT); |
1225 | if (!state->result) |
1226 | return JS_FALSE; |
1227 | state->result->u.flat.chr = c; |
1228 | state->result->u.flat.length = 1; |
1229 | state->progLength += 3; |
1230 | break; |
1231 | case '1': |
1232 | case '2': |
1233 | case '3': |
1234 | case '4': |
1235 | case '5': |
1236 | case '6': |
1237 | case '7': |
1238 | case '8': |
1239 | case '9': |
1240 | termStart = state->cp - 1; |
1241 | num = GetDecimalValue(c, state->parenCount, FindParenCount, state); |
1242 | if (state->flags & JSREG_FIND_PAREN_ERROR) |
1243 | return JS_FALSE; |
1244 | if (num == OVERFLOW_VALUE) { |
1245 | /* Give a strict mode warning. */ |
1246 | if (!ReportRegExpError(state, |
1247 | JSREPORT_WARNING | JSREPORT_STRICT, |
1248 | (c >= '8') |
1249 | ? JSMSG_INVALID_BACKREF |
1250 | : JSMSG_BAD_BACKREF)) { |
1251 | return JS_FALSE; |
1252 | } |
1253 | |
1254 | /* |
1255 | * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax |
1256 | * error here. However, for compatibility with IE, we treat the |
1257 | * whole backref as flat if the first character in it is not a |
1258 | * valid octal character, and as an octal escape otherwise. |
1259 | */ |
1260 | state->cp = termStart; |
1261 | if (c >= '8') { |
1262 | /* Treat this as flat. termStart - 1 is the \. */ |
1263 | c = '\\'; |
1264 | goto asFlat; |
1265 | } |
1266 | |
1267 | /* Treat this as an octal escape. */ |
1268 | goto doOctal; |
1269 | } |
1270 | |
1271 | /* |
1272 | * When FindParenCount calls the regex parser recursively (to find |
1273 | * the number of backrefs) num can be arbitrary and the maximum |
1274 | * supported number of backrefs does not bound it. |
1275 | */ |
1276 | JS_ASSERT_IF(!(state->flags & JSREG_FIND_PAREN_COUNT), |
1277 | 1 <= num && num <= 0x10000); |
1278 | state->result = NewRENode(state, REOP_BACKREF); |
1279 | if (!state->result) |
1280 | return JS_FALSE; |
1281 | state->result->u.parenIndex = num - 1; |
1282 | state->progLength |
1283 | += 1 + GetCompactIndexWidth(state->result->u.parenIndex); |
1284 | break; |
1285 | /* Control escape */ |
1286 | case 'f': |
1287 | c = 0xC; |
1288 | goto doFlat; |
1289 | case 'n': |
1290 | c = 0xA; |
1291 | goto doFlat; |
1292 | case 'r': |
1293 | c = 0xD; |
1294 | goto doFlat; |
1295 | case 't': |
1296 | c = 0x9; |
1297 | goto doFlat; |
1298 | case 'v': |
1299 | c = 0xB; |
1300 | goto doFlat; |
1301 | /* Control letter */ |
1302 | case 'c': |
1303 | if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) { |
1304 | c = (jschar) (*state->cp++ & 0x1F); |
1305 | } else { |
1306 | /* back off to accepting the original '\' as a literal */ |
1307 | --state->cp; |
1308 | c = '\\'; |
1309 | } |
1310 | goto doFlat; |
1311 | /* HexEscapeSequence */ |
1312 | case 'x': |
1313 | nDigits = 2; |
1314 | goto lexHex; |
1315 | /* UnicodeEscapeSequence */ |
1316 | case 'u': |
1317 | nDigits = 4; |
1318 | lexHex: |
1319 | n = 0; |
1320 | for (i = 0; i < nDigits && state->cp < state->cpend; i++) { |
1321 | uintN digit; |
1322 | c = *state->cp++; |
1323 | if (!isASCIIHexDigit(c, &digit)) { |
1324 | /* |
1325 | * Back off to accepting the original 'u' or 'x' as a |
1326 | * literal. |
1327 | */ |
1328 | state->cp -= i + 2; |
1329 | n = *state->cp++; |
1330 | break; |
1331 | } |
1332 | n = (n << 4) | digit; |
1333 | } |
1334 | c = (jschar) n; |
1335 | goto doFlat; |
1336 | /* Character class escapes */ |
1337 | case 'd': |
1338 | state->result = NewRENode(state, REOP_DIGIT); |
1339 | doSimple: |
1340 | if (!state->result) |
1341 | return JS_FALSE; |
1342 | state->progLength++; |
1343 | break; |
1344 | case 'D': |
1345 | state->result = NewRENode(state, REOP_NONDIGIT); |
1346 | goto doSimple; |
1347 | case 's': |
1348 | state->result = NewRENode(state, REOP_SPACE); |
1349 | goto doSimple; |
1350 | case 'S': |
1351 | state->result = NewRENode(state, REOP_NONSPACE); |
1352 | goto doSimple; |
1353 | case 'w': |
1354 | state->result = NewRENode(state, REOP_ALNUM); |
1355 | goto doSimple; |
1356 | case 'W': |
1357 | state->result = NewRENode(state, REOP_NONALNUM); |
1358 | goto doSimple; |
1359 | /* IdentityEscape */ |
1360 | default: |
1361 | state->result = NewRENode(state, REOP_FLAT); |
1362 | if (!state->result) |
1363 | return JS_FALSE; |
1364 | state->result->u.flat.chr = c; |
1365 | state->result->u.flat.length = 1; |
1366 | state->result->kid = (void *) (state->cp - 1); |
1367 | state->progLength += 3; |
1368 | break; |
1369 | } |
1370 | break; |
1371 | case '[': |
1372 | state->result = NewRENode(state, REOP_CLASS); |
1373 | if (!state->result) |
1374 | return JS_FALSE; |
1375 | termStart = state->cp; |
1376 | state->result->u.ucclass.startIndex = termStart - state->cpbegin; |
1377 | for (;;) { |
1378 | if (state->cp == state->cpend) { |
1379 | ReportRegExpErrorHelper(state, JSREPORT_ERROR, |
1380 | JSMSG_UNTERM_CLASS, termStart); |
1381 | |
1382 | return JS_FALSE; |
1383 | } |
1384 | if (*state->cp == '\\') { |
1385 | state->cp++; |
1386 | if (state->cp != state->cpend) |
1387 | state->cp++; |
1388 | continue; |
1389 | } |
1390 | if (*state->cp == ']') { |
1391 | state->result->u.ucclass.kidlen = state->cp - termStart; |
1392 | break; |
1393 | } |
1394 | state->cp++; |
1395 | } |
1396 | for (i = 0; i < CLASS_CACHE_SIZE; i++) { |
1397 | if (!state->classCache[i].start) { |
1398 | state->classCache[i].start = termStart; |
1399 | state->classCache[i].length = state->result->u.ucclass.kidlen; |
1400 | state->classCache[i].index = state->classCount; |
1401 | break; |
1402 | } |
1403 | if (state->classCache[i].length == |
1404 | state->result->u.ucclass.kidlen) { |
1405 | for (n = 0; ; n++) { |
1406 | if (n == state->classCache[i].length) { |
1407 | state->result->u.ucclass.index |
1408 | = state->classCache[i].index; |
1409 | goto claim; |
1410 | } |
1411 | if (state->classCache[i].start[n] != termStart[n]) |
1412 | break; |
1413 | } |
1414 | } |
1415 | } |
1416 | state->result->u.ucclass.index = state->classCount++; |
1417 | |
1418 | claim: |
1419 | /* |
1420 | * Call CalculateBitmapSize now as we want any errors it finds |
1421 | * to be reported during the parse phase, not at execution. |
1422 | */ |
1423 | if (!CalculateBitmapSize(state, state->result, termStart, state->cp++)) |
1424 | return JS_FALSE; |
1425 | /* |
1426 | * Update classBitmapsMem with number of bytes to hold bmsize bits, |
1427 | * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8 |
1428 | * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize. |
1429 | */ |
1430 | n = (state->result->u.ucclass.bmsize >> 3) + 1; |
1431 | if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) { |
1432 | ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX); |
1433 | return JS_FALSE; |
1434 | } |
1435 | state->classBitmapsMem += n; |
1436 | /* CLASS, <index> */ |
1437 | state->progLength |
1438 | += 1 + GetCompactIndexWidth(state->result->u.ucclass.index); |
1439 | break; |
1440 | |
1441 | case '.': |
1442 | state->result = NewRENode(state, REOP_DOT); |
1443 | goto doSimple; |
1444 | |
1445 | case '{': |
1446 | { |
1447 | const jschar *errp = state->cp--; |
1448 | intN err; |
1449 | |
1450 | err = ParseMinMaxQuantifier(state, JS_TRUE); |
1451 | state->cp = errp; |
1452 | |
1453 | if (err < 0) |
1454 | goto asFlat; |
1455 | |
1456 | /* FALL THROUGH */ |
1457 | } |
1458 | case '*': |
1459 | case '+': |
1460 | case '?': |
1461 | ReportRegExpErrorHelper(state, JSREPORT_ERROR, |
1462 | JSMSG_BAD_QUANTIFIER, state->cp - 1); |
1463 | return JS_FALSE; |
1464 | default: |
1465 | asFlat: |
1466 | state->result = NewRENode(state, REOP_FLAT); |
1467 | if (!state->result) |
1468 | return JS_FALSE; |
1469 | state->result->u.flat.chr = c; |
1470 | state->result->u.flat.length = 1; |
1471 | state->result->kid = (void *) (state->cp - 1); |
1472 | state->progLength += 3; |
1473 | break; |
1474 | } |
1475 | return ParseQuantifier(state); |
1476 | } |
1477 | |
1478 | static JSBool |
1479 | ParseQuantifier(CompilerState *state) |
1480 | { |
1481 | RENode *term; |
1482 | term = state->result; |
1483 | if (state->cp < state->cpend) { |
1484 | switch (*state->cp) { |
1485 | case '+': |
1486 | state->result = NewRENode(state, REOP_QUANT); |
1487 | if (!state->result) |
1488 | return JS_FALSE; |
1489 | state->result->u.range.min = 1; |
1490 | state->result->u.range.max = (uintN)-1; |
1491 | /* <PLUS>, <next> ... <ENDCHILD> */ |
1492 | state->progLength += 4; |
1493 | goto quantifier; |
1494 | case '*': |
1495 | state->result = NewRENode(state, REOP_QUANT); |
1496 | if (!state->result) |
1497 | return JS_FALSE; |
1498 | state->result->u.range.min = 0; |
1499 | state->result->u.range.max = (uintN)-1; |
1500 | /* <STAR>, <next> ... <ENDCHILD> */ |
1501 | state->progLength += 4; |
1502 | goto quantifier; |
1503 | case '?': |
1504 | state->result = NewRENode(state, REOP_QUANT); |
1505 | if (!state->result) |
1506 | return JS_FALSE; |
1507 | state->result->u.range.min = 0; |
1508 | state->result->u.range.max = 1; |
1509 | /* <OPT>, <next> ... <ENDCHILD> */ |
1510 | state->progLength += 4; |
1511 | goto quantifier; |
1512 | case '{': /* balance '}' */ |
1513 | { |
1514 | intN err; |
1515 | const jschar *errp = state->cp; |
1516 | |
1517 | err = ParseMinMaxQuantifier(state, JS_FALSE); |
1518 | if (err == 0) |
1519 | goto quantifier; |
1520 | if (err == -1) |
1521 | return JS_TRUE; |
1522 | |
1523 | ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp); |
1524 | return JS_FALSE; |
1525 | } |
1526 | default:; |
1527 | } |
1528 | } |
1529 | return JS_TRUE; |
1530 | |
1531 | quantifier: |
1532 | if (state->treeDepth == TREE_DEPTH_MAX) { |
1533 | ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX); |
1534 | return JS_FALSE; |
1535 | } |
1536 | |
1537 | ++state->treeDepth; |
1538 | ++state->cp; |
1539 | state->result->kid = term; |
1540 | if (state->cp < state->cpend && *state->cp == '?') { |
1541 | ++state->cp; |
1542 | state->result->u.range.greedy = JS_FALSE; |
1543 | } else { |
1544 | state->result->u.range.greedy = JS_TRUE; |
1545 | } |
1546 | return JS_TRUE; |
1547 | } |
1548 | |
1549 | static intN |
1550 | ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues) |
1551 | { |
1552 | uintN min, max; |
1553 | jschar c; |
1554 | const jschar *errp = state->cp++; |
1555 | |
1556 | c = *state->cp; |
1557 | if (JS7_ISDEC(c)) { |
1558 | ++state->cp; |
1559 | min = GetDecimalValue(c, 0xFFFF, NULL, state); |
1560 | c = *state->cp; |
1561 | |
1562 | if (!ignoreValues && min == OVERFLOW_VALUE) |
1563 | return JSMSG_MIN_TOO_BIG; |
1564 | |
1565 | if (c == ',') { |
1566 | c = *++state->cp; |
1567 | if (JS7_ISDEC(c)) { |
1568 | ++state->cp; |
1569 | max = GetDecimalValue(c, 0xFFFF, NULL, state); |
1570 | c = *state->cp; |
1571 | if (!ignoreValues && max == OVERFLOW_VALUE) |
1572 | return JSMSG_MAX_TOO_BIG; |
1573 | if (!ignoreValues && min > max) |
1574 | return JSMSG_OUT_OF_ORDER; |
1575 | } else { |
1576 | max = (uintN)-1; |
1577 | } |
1578 | } else { |
1579 | max = min; |
1580 | } |
1581 | if (c == '}') { |
1582 | state->result = NewRENode(state, REOP_QUANT); |
1583 | if (!state->result) |
1584 | return JSMSG_OUT_OF_MEMORY; |
1585 | state->result->u.range.min = min; |
1586 | state->result->u.range.max = max; |
1587 | /* |
1588 | * QUANT, <min>, <max>, <next> ... <ENDCHILD> |
1589 | * where <max> is written as compact(max+1) to make |
1590 | * (uintN)-1 sentinel to occupy 1 byte, not width_of(max)+1. |
1591 | */ |
1592 | state->progLength += (1 + GetCompactIndexWidth(min) |
1593 | + GetCompactIndexWidth(max + 1) |
1594 | +3); |
1595 | return 0; |
1596 | } |
1597 | } |
1598 | |
1599 | state->cp = errp; |
1600 | return -1; |
1601 | } |
1602 | |
1603 | static JSBool |
1604 | SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target) |
1605 | { |
1606 | ptrdiff_t offset = target - jump; |
1607 | |
1608 | /* Check that target really points forward. */ |
1609 | JS_ASSERT(offset >= 2); |
1610 | if ((size_t)offset > OFFSET_MAX) |
1611 | return JS_FALSE; |
1612 | |
1613 | jump[0] = JUMP_OFFSET_HI(offset); |
1614 | jump[1] = JUMP_OFFSET_LO(offset); |
1615 | return JS_TRUE; |
1616 | } |
1617 | |
1618 | /* Copy the charset data from a character class node to the charset list |
1619 | * in the regexp object. */ |
1620 | static JS_ALWAYS_INLINE RECharSet * |
1621 | InitNodeCharSet(JSRegExp *re, RENode *node) |
1622 | { |
1623 | RECharSet *charSet = &re->classList[node->u.ucclass.index]; |
1624 | charSet->converted = JS_FALSE; |
1625 | charSet->length = node->u.ucclass.bmsize; |
1626 | charSet->u.src.startIndex = node->u.ucclass.startIndex; |
1627 | charSet->u.src.length = node->u.ucclass.kidlen; |
1628 | charSet->sense = node->u.ucclass.sense; |
1629 | return charSet; |
1630 | } |
1631 | |
1632 | /* |
1633 | * Generate bytecode for the tree rooted at t using an explicit stack instead |
1634 | * of recursion. |
1635 | */ |
1636 | static jsbytecode * |
1637 | EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth, |
1638 | jsbytecode *pc, RENode *t) |
1639 | { |
1640 | EmitStateStackEntry *emitStateSP, *emitStateStack; |
1641 | REOp op; |
1642 | |
1643 | if (treeDepth == 0) { |
1644 | emitStateStack = NULL; |
1645 | } else { |
1646 | emitStateStack = |
1647 | (EmitStateStackEntry *)JS_malloc(state->context, |
1648 | sizeof(EmitStateStackEntry) * |
1649 | treeDepth); |
1650 | if (!emitStateStack) |
1651 | return NULL; |
1652 | } |
1653 | emitStateSP = emitStateStack; |
1654 | op = t->op; |
1655 | JS_ASSERT(op < REOP_LIMIT); |
1656 | |
1657 | for (;;) { |
1658 | *pc++ = op; |
1659 | switch (op) { |
1660 | case REOP_EMPTY: |
1661 | --pc; |
1662 | break; |
1663 | |
1664 | case REOP_ALTPREREQ2: |
1665 | case REOP_ALTPREREQ: |
1666 | JS_ASSERT(emitStateSP); |
1667 | emitStateSP->altHead = pc - 1; |
1668 | emitStateSP->endTermFixup = pc; |
1669 | pc += OFFSET_LEN; |
1670 | SET_ARG(pc, t->u.altprereq.ch1); |
1671 | pc += ARG_LEN; |
1672 | SET_ARG(pc, t->u.altprereq.ch2); |
1673 | pc += ARG_LEN; |
1674 | |
1675 | emitStateSP->nextAltFixup = pc; /* offset to next alternate */ |
1676 | pc += OFFSET_LEN; |
1677 | |
1678 | emitStateSP->continueNode = t; |
1679 | emitStateSP->continueOp = REOP_JUMP; |
1680 | emitStateSP->jumpToJumpFlag = JS_FALSE; |
1681 | ++emitStateSP; |
1682 | JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth); |
1683 | t = (RENode *) t->kid; |
1684 | op = t->op; |
1685 | JS_ASSERT(op < REOP_LIMIT); |
1686 | continue; |
1687 | |
1688 | case REOP_JUMP: |
1689 | emitStateSP->nextTermFixup = pc; /* offset to following term */ |
1690 | pc += OFFSET_LEN; |
1691 | if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc)) |
1692 | goto jump_too_big; |
1693 | emitStateSP->continueOp = REOP_ENDALT; |
1694 | ++emitStateSP; |
1695 | JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth); |
1696 | t = (RENode *) t->u.kid2; |
1697 | op = t->op; |
1698 | JS_ASSERT(op < REOP_LIMIT); |
1699 | continue; |
1700 | |
1701 | case REOP_ENDALT: |
1702 | /* |
1703 | * If we already patched emitStateSP->nextTermFixup to jump to |
1704 | * a nearer jump, to avoid 16-bit immediate offset overflow, we |
1705 | * are done here. |
1706 | */ |
1707 | if (emitStateSP->jumpToJumpFlag) |
1708 | break; |
1709 | |
1710 | /* |
1711 | * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT. |
1712 | * REOP_ENDALT is executed only on successful match of the last |
1713 | * alternate in a group. |
1714 | */ |
1715 | if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc)) |
1716 | goto jump_too_big; |
1717 | if (t->op != REOP_ALT) { |
1718 | if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc)) |
1719 | goto jump_too_big; |
1720 | } |
1721 | |
1722 | /* |
1723 | * If the program is bigger than the REOP_JUMP offset range, then |
1724 | * we must check for alternates before this one that are part of |
1725 | * the same group, and fix up their jump offsets to target jumps |
1726 | * close enough to fit in a 16-bit unsigned offset immediate. |
1727 | */ |
1728 | if ((size_t)(pc - re->program) > OFFSET_MAX && |
1729 | emitStateSP > emitStateStack) { |
1730 | EmitStateStackEntry *esp, *esp2; |
1731 | jsbytecode *alt, *jump; |
1732 | ptrdiff_t span, header; |
1733 | |
1734 | esp2 = emitStateSP; |
1735 | alt = esp2->altHead; |
1736 | for (esp = esp2 - 1; esp >= emitStateStack; --esp) { |
1737 | if (esp->continueOp == REOP_ENDALT && |
1738 | !esp->jumpToJumpFlag && |
1739 | esp->nextTermFixup + OFFSET_LEN == alt && |
1740 | (size_t)(pc - ((esp->continueNode->op != REOP_ALT) |
1741 | ? esp->endTermFixup |
1742 | : esp->nextTermFixup)) > OFFSET_MAX) { |
1743 | alt = esp->altHead; |
1744 | jump = esp->nextTermFixup; |
1745 | |
1746 | /* |
1747 | * The span must be 1 less than the distance from |
1748 | * jump offset to jump offset, so we actually jump |
1749 | * to a REOP_JUMP bytecode, not to its offset! |
1750 | */ |
1751 | for (;;) { |
1752 | JS_ASSERT(jump < esp2->nextTermFixup); |
1753 | span = esp2->nextTermFixup - jump - 1; |
1754 | if ((size_t)span <= OFFSET_MAX) |
1755 | break; |
1756 | do { |
1757 | if (--esp2 == esp) |
1758 | goto jump_too_big; |
1759 | } while (esp2->continueOp != REOP_ENDALT); |
1760 | } |
1761 | |
1762 | jump[0] = JUMP_OFFSET_HI(span); |
1763 | jump[1] = JUMP_OFFSET_LO(span); |
1764 | |
1765 | if (esp->continueNode->op != REOP_ALT) { |
1766 | /* |
1767 | * We must patch the offset at esp->endTermFixup |
1768 | * as well, for the REOP_ALTPREREQ{,2} opcodes. |
1769 | * If we're unlucky and endTermFixup is more than |
1770 | * OFFSET_MAX bytes from its target, we cheat by |
1771 | * jumping 6 bytes to the jump whose offset is at |
1772 | * esp->nextTermFixup, which has the same target. |
1773 | */ |
1774 | jump = esp->endTermFixup; |
1775 | header = esp->nextTermFixup - jump; |
1776 | span += header; |
1777 | if ((size_t)span > OFFSET_MAX) |
1778 | span = header; |
1779 | |
1780 | jump[0] = JUMP_OFFSET_HI(span); |
1781 | jump[1] = JUMP_OFFSET_LO(span); |
1782 | } |
1783 | |
1784 | esp->jumpToJumpFlag = JS_TRUE; |
1785 | } |
1786 | } |
1787 | } |
1788 | break; |
1789 | |
1790 | case REOP_ALT: |
1791 | JS_ASSERT(emitStateSP); |
1792 | emitStateSP->altHead = pc - 1; |
1793 | emitStateSP->nextAltFixup = pc; /* offset to next alternate */ |
1794 | pc += OFFSET_LEN; |
1795 | emitStateSP->continueNode = t; |
1796 | emitStateSP->continueOp = REOP_JUMP; |
1797 | emitStateSP->jumpToJumpFlag = JS_FALSE; |
1798 | ++emitStateSP; |
1799 | JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth); |
1800 | t = (RENode *) t->kid; |
1801 | op = t->op; |
1802 | JS_ASSERT(op < REOP_LIMIT); |
1803 | continue; |
1804 | |
1805 | case REOP_FLAT: |
1806 | /* |
1807 | * Coalesce FLATs if possible and if it would not increase bytecode |
1808 | * beyond preallocated limit. The latter happens only when bytecode |
1809 | * size for coalesced string with offset p and length 2 exceeds 6 |
1810 | * bytes preallocated for 2 single char nodes, i.e. when |
1811 | * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or |
1812 | * GetCompactIndexWidth(p) > 4. |
1813 | * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more |
1814 | * nodes strictly decreases bytecode size, the check has to be |
1815 | * done only for the first coalescing. |
1816 | */ |
1817 | if (t->kid && |
1818 | GetCompactIndexWidth((jschar *)t->kid - state->cpbegin) <= 4) |
1819 | { |
1820 | while (t->next && |
1821 | t->next->op == REOP_FLAT && |
1822 | (jschar*)t->kid + t->u.flat.length == |
1823 | (jschar*)t->next->kid) { |
1824 | t->u.flat.length += t->next->u.flat.length; |
1825 | t->next = t->next->next; |
1826 | } |
1827 | } |
1828 | if (t->kid && t->u.flat.length > 1) { |
1829 | pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT; |
1830 | pc = WriteCompactIndex(pc, (jschar *)t->kid - state->cpbegin); |
1831 | pc = WriteCompactIndex(pc, t->u.flat.length); |
1832 | } else if (t->u.flat.chr < 256) { |
1833 | pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1; |
1834 | *pc++ = (jsbytecode) t->u.flat.chr; |
1835 | } else { |
1836 | pc[-1] = (state->flags & JSREG_FOLD) |
1837 | ? REOP_UCFLAT1i |
1838 | : REOP_UCFLAT1; |
1839 | SET_ARG(pc, t->u.flat.chr); |
1840 | pc += ARG_LEN; |
1841 | } |
1842 | break; |
1843 | |
1844 | case REOP_LPAREN: |
1845 | JS_ASSERT(emitStateSP); |
1846 | pc = WriteCompactIndex(pc, t->u.parenIndex); |
1847 | emitStateSP->continueNode = t; |
1848 | emitStateSP->continueOp = REOP_RPAREN; |
1849 | ++emitStateSP; |
1850 | JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth); |
1851 | t = (RENode *) t->kid; |
1852 | op = t->op; |
1853 | continue; |
1854 | |
1855 | case REOP_RPAREN: |
1856 | pc = WriteCompactIndex(pc, t->u.parenIndex); |
1857 | break; |
1858 | |
1859 | case REOP_BACKREF: |
1860 | pc = WriteCompactIndex(pc, t->u.parenIndex); |
1861 | break; |
1862 | |
1863 | case REOP_ASSERT: |
1864 | JS_ASSERT(emitStateSP); |
1865 | emitStateSP->nextTermFixup = pc; |
1866 | pc += OFFSET_LEN; |
1867 | emitStateSP->continueNode = t; |
1868 | emitStateSP->continueOp = REOP_ASSERTTEST; |
1869 | ++emitStateSP; |
1870 | JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth); |
1871 | t = (RENode *) t->kid; |
1872 | op = t->op; |
1873 | continue; |
1874 | |
1875 | case REOP_ASSERTTEST: |
1876 | case REOP_ASSERTNOTTEST: |
1877 | if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc)) |
1878 | goto jump_too_big; |
1879 | break; |
1880 | |
1881 | case REOP_ASSERT_NOT: |
1882 | JS_ASSERT(emitStateSP); |
1883 | emitStateSP->nextTermFixup = pc; |
1884 | pc += OFFSET_LEN; |
1885 | emitStateSP->continueNode = t; |
1886 | emitStateSP->continueOp = REOP_ASSERTNOTTEST; |
1887 | ++emitStateSP; |
1888 | JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth); |
1889 | t = (RENode *) t->kid; |
1890 | op = t->op; |
1891 | continue; |
1892 | |
1893 | case REOP_QUANT: |
1894 | JS_ASSERT(emitStateSP); |
1895 | if (t->u.range.min == 0 && t->u.range.max == (uintN)-1) { |
1896 | pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR; |
1897 | } else if (t->u.range.min == 0 && t->u.range.max == 1) { |
1898 | pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT; |
1899 | } else if (t->u.range.min == 1 && t->u.range.max == (uintN) -1) { |
1900 | pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS; |
1901 | } else { |
1902 | if (!t->u.range.greedy) |
1903 | pc[-1] = REOP_MINIMALQUANT; |
1904 | pc = WriteCompactIndex(pc, t->u.range.min); |
1905 | /* |
1906 | * Write max + 1 to avoid using size_t(max) + 1 bytes |
1907 | * for (uintN)-1 sentinel. |
1908 | */ |
1909 | pc = WriteCompactIndex(pc, t->u.range.max + 1); |
1910 | } |
1911 | emitStateSP->nextTermFixup = pc; |
1912 | pc += OFFSET_LEN; |
1913 | emitStateSP->continueNode = t; |
1914 | emitStateSP->continueOp = REOP_ENDCHILD; |
1915 | ++emitStateSP; |
1916 | JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth); |
1917 | t = (RENode *) t->kid; |
1918 | op = t->op; |
1919 | continue; |
1920 | |
1921 | case REOP_ENDCHILD: |
1922 | if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc)) |
1923 | goto jump_too_big; |
1924 | break; |
1925 | |
1926 | case REOP_CLASS: |
1927 | if (!t->u.ucclass.sense) |
1928 | pc[-1] = REOP_NCLASS; |
1929 | pc = WriteCompactIndex(pc, t->u.ucclass.index); |
1930 | InitNodeCharSet(re, t); |
1931 | break; |
1932 | |
1933 | default: |
1934 | break; |
1935 | } |
1936 | |
1937 | t = t->next; |
1938 | if (t) { |
1939 | op = t->op; |
1940 | } else { |
1941 | if (emitStateSP == emitStateStack) |
1942 | break; |
1943 | --emitStateSP; |
1944 | t = emitStateSP->continueNode; |
1945 | op = (REOp) emitStateSP->continueOp; |
1946 | } |
1947 | } |
1948 | |
1949 | cleanup: |
1950 | if (emitStateStack) |
1951 | JS_free(state->context, emitStateStack); |
1952 | return pc; |
1953 | |
1954 | jump_too_big: |
1955 | ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX); |
1956 | pc = NULL; |
1957 | goto cleanup; |
1958 | } |
1959 | |
1960 | static JSBool |
1961 | CompileRegExpToAST(JSContext* cx, JSTokenStream* ts, |
1962 | JSString* str, uintN flags, CompilerState& state) |
1963 | { |
1964 | uintN i; |
1965 | size_t len; |
1966 | |
1967 | len = JSSTRING_LENGTH(str); |
1968 | |
1969 | state.context = cx; |
1970 | state.tokenStream = ts; |
1971 | state.cp = js_UndependString(cx, str); |
1972 | if (!state.cp) |
1973 | return JS_FALSE; |
1974 | state.cpbegin = state.cp; |
1975 | state.cpend = state.cp + len; |
1976 | state.flags = flags; |
1977 | state.parenCount = 0; |
1978 | state.classCount = 0; |
1979 | state.progLength = 0; |
1980 | state.treeDepth = 0; |
1981 | state.classBitmapsMem = 0; |
1982 | for (i = 0; i < CLASS_CACHE_SIZE; i++) |
1983 | state.classCache[i].start = NULL; |
1984 | |
1985 | if (len != 0 && (flags & JSREG_FLAT)) { |
1986 | state.result = NewRENode(&state, REOP_FLAT); |
1987 | if (!state.result) |
1988 | return JS_FALSE; |
1989 | state.result->u.flat.chr = *state.cpbegin; |
1990 | state.result->u.flat.length = len; |
1991 | state.result->kid = (void *) state.cpbegin; |
1992 | /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */ |
1993 | state.progLength += 1 + GetCompactIndexWidth(0) |
1994 | + GetCompactIndexWidth(len); |
1995 | return JS_TRUE; |
1996 | } |
1997 | |
1998 | return ParseRegExp(&state); |
1999 | } |
2000 | |
2001 | #ifdef JS_TRACER |
2002 | typedef List<LIns*, LIST_NonGCObjects> LInsList; |
2003 | |
2004 | /* Dummy GC for nanojit placement new. */ |
2005 | static GC gc; |
2006 | |
2007 | static void* |
2008 | HashRegExp(uint16 flags, jschar* s, size_t n) |
2009 | { |
2010 | uint32 h; |
2011 | |
2012 | for (h = 0; n; s++, n--) |
2013 | h = JS_ROTATE_LEFT32(h, 4) ^ *s; |
2014 | return (void*)(h + flags); |
2015 | } |
2016 | |
2017 | struct RESideExit : public SideExit { |
2018 | size_t re_length; |
2019 | uint16 re_flags; |
2020 | jschar re_chars[1]; |
2021 | }; |
2022 | |
2023 | /* Return the cached fragment for the given regexp, or NULL. */ |
2024 | static Fragment* |
2025 | LookupNativeRegExp(JSContext* cx, void* hash, uint16 re_flags, |
2026 | jschar* re_chars, size_t re_length) |
2027 | { |
2028 | Fragmento* fragmento = JS_TRACE_MONITOR(cx).reFragmento; |
2029 | Fragment* fragment = fragmento->getLoop(hash); |
2030 | while (fragment) { |
2031 | if (fragment->lastIns) { |
2032 | RESideExit* exit = (RESideExit*)fragment->lastIns->record()->exit; |
2033 | if (exit->re_flags == re_flags && |
2034 | exit->re_length == re_length && |
2035 | !memcmp(exit->re_chars, re_chars, re_length * sizeof(jschar))) { |
2036 | return fragment; |
2037 | } |
2038 | } |
2039 | fragment = fragment->peer; |
2040 | } |
2041 | return NULL; |
2042 | } |
2043 | |
2044 | static JSBool |
2045 | ProcessCharSet(JSContext *cx, JSRegExp *re, RECharSet *charSet); |
2046 | |
2047 | class RegExpNativeCompiler { |
2048 | private: |
2049 | JSContext* cx; |
2050 | JSRegExp* re; |
2051 | CompilerState* cs; /* RegExp to compile */ |
2052 | Fragment* fragment; |
2053 | LirWriter* lir; |
2054 | LirBufWriter* lirBufWriter; /* for skip */ |
2055 | |
2056 | LIns* state; |
2057 | LIns* gdata; |
2058 | LIns* cpend; |
2059 | |
2060 | JSBool isCaseInsensitive() const { return (cs->flags & JSREG_FOLD) != 0; } |
2061 | |
2062 | JSBool targetCurrentPoint(LIns* ins) |
2063 | { |
2064 | if (fragment->lirbuf->outOMem()) |
2065 | return JS_FALSE; |
2066 | ins->target(lir->ins0(LIR_label)); |
2067 | return JS_TRUE; |
2068 | } |
2069 | |
2070 | JSBool targetCurrentPoint(LInsList& fails) |
2071 | { |
2072 | if (fragment->lirbuf->outOMem()) |
2073 | return JS_FALSE; |
2074 | LIns* fail = lir->ins0(LIR_label); |
2075 | for (size_t i = 0; i < fails.size(); ++i) { |
2076 | fails[i]->target(fail); |
2077 | } |
2078 | fails.clear(); |
2079 | return JS_TRUE; |
2080 | } |
2081 | |
2082 | /* |
2083 | * These functions return the new position after their match operation, |
2084 | * or NULL if there was an error. |
2085 | */ |
2086 | LIns* compileEmpty(RENode* node, LIns* pos, LInsList& fails) |
2087 | { |
2088 | return pos; |
2089 | } |
2090 | |
2091 | LIns* compileFlatSingleChar(jschar ch, LIns* pos, LInsList& fails) |
2092 | { |
2093 | /* |
2094 | * Fast case-insensitive test for ASCII letters: convert text |
2095 | * char to lower case by bit-or-ing in 32 and compare. |
2096 | */ |
2097 | JSBool useFastCI = JS_FALSE; |
2098 | jschar ch2 = ch; /* 2nd char to test for if ci */ |
2099 | if (cs->flags & JSREG_FOLD) { |
2100 | if ((L'A' <= ch && ch <= L'Z') || (L'a' <= ch && ch <= L'z')) { |
2101 | ch |= 32; |
2102 | ch2 = ch; |
2103 | useFastCI = JS_TRUE; |
2104 | } else if (JS_TOLOWER(ch) != ch) { |
2105 | ch2 = JS_TOLOWER(ch); |
2106 | ch = JS_TOUPPER(ch); |
2107 | } |
2108 | } |
2109 | |
2110 | LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_lt, pos, cpend), 0); |
2111 | fails.add(to_fail); |
2112 | LIns* text_ch = lir->insLoad(LIR_ldcs, pos, lir->insImm(0)); |
2113 | LIns* comp_ch = useFastCI ? |
2114 | lir->ins2(LIR_or, text_ch, lir->insImm(32)) : |
2115 | text_ch; |
2116 | if (ch == ch2) { |
2117 | fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch)), 0)); |
2118 | } else { |
2119 | LIns* to_ok = lir->insBranch(LIR_jt, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch)), 0); |
2120 | fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch2)), 0)); |
2121 | if (!targetCurrentPoint(to_ok)) |
2122 | return NULL; |
2123 | } |
2124 | |
2125 | return lir->ins2(LIR_piadd, pos, lir->insImm(2)); |
2126 | } |
2127 | |
2128 | LIns* compileFlatDoubleChar(jschar ch1, jschar ch2, LIns* pos, |
2129 | LInsList& fails) |
2130 | { |
2131 | #ifdef IS_BIG_ENDIAN |
2132 | uint32 word = (ch1 << 16) | ch2; |
2133 | #else |
2134 | uint32 word = (ch2 << 16) | ch1; |
2135 | #endif |
2136 | /* |
2137 | * Fast case-insensitive test for ASCII letters: convert text |
2138 | * char to lower case by bit-or-ing in 32 and compare. |
2139 | */ |
2140 | JSBool useFastCI = JS_FALSE; |
2141 | union { jschar c[2]; uint32 i; } mask; |
2142 | if (cs->flags & JSREG_FOLD) { |
2143 | JSBool mask1 = (L'A' <= ch1 && ch1 <= L'Z') || (L'a' <= ch1 && ch1 <= L'z'); |
2144 | JSBool mask2 = (L'A' <= ch2 && ch2 <= L'Z') || (L'a' <= ch2 && ch2 <= L'z'); |
2145 | if ((!mask1 && JS_TOLOWER(ch1) != ch1) || (!mask2 && JS_TOLOWER(ch2) != ch2)) { |
2146 | pos = compileFlatSingleChar(ch1, pos, fails); |
2147 | if (!pos) return NULL; |
2148 | return compileFlatSingleChar(ch2, pos, fails); |
2149 | } |
2150 | |
2151 | mask.c[0] = mask1 ? 0x0020 : 0x0; |
2152 | mask.c[1] = mask2 ? 0x0020 : 0x0; |
2153 | |
2154 | if (mask.i) { |
2155 | word |= mask.i; |
2156 | useFastCI = JS_TRUE; |
2157 | } |
2158 | } |
2159 | |
2160 | LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_lt, pos, lir->ins2(LIR_sub, cpend, lir->insImm(2))), 0); |
2161 | fails.add(to_fail); |
2162 | LIns* text_word = lir->insLoad(LIR_ld, pos, lir->insImm(0)); |
2163 | LIns* comp_word = useFastCI ? |
2164 | lir->ins2(LIR_or, text_word, lir->insImm(mask.i)) : |
2165 | text_word; |
2166 | fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, comp_word, lir->insImm(word)), 0)); |
2167 | |
2168 | return lir->ins2(LIR_piadd, pos, lir->insImm(4)); |
2169 | } |
2170 | |
2171 | LIns* compileClass(RENode* node, LIns* pos, LInsList& fails) |
2172 | { |
2173 | if (!node->u.ucclass.sense) |
2174 | return JS_FALSE; |
2175 | /* |
2176 | * If we share generated native code, we need to make a copy |
2177 | * of the bitmap because the original regexp's copy is destroyed |
2178 | * when that regexp is. |
2179 | */ |
2180 | RECharSet *charSet = &re->classList[node->u.ucclass.index]; |
2181 | size_t bitmapLen = (charSet->length >> 3) + 1; |
2182 | /* skip() can't hold large data blocks. */ |
2183 | if (bitmapLen > 1024) |
2184 | return NULL; |
2185 | /* The following line allocates charSet.u.bits if successful. */ |
2186 | if (!charSet->converted && !ProcessCharSet(cx, re, charSet)) |
2187 | return NULL; |
2188 | LIns* skip = lirBufWriter->skip(bitmapLen); |
2189 | if (fragment->lirbuf->outOMem()) |
2190 | return NULL; |
2191 | void* bitmapData = skip->payload(); |
2192 | memcpy(bitmapData, charSet->u.bits, bitmapLen); |
2193 | |
2194 | LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_lt, pos, cpend), 0); |
2195 | fails.add(to_fail); |
2196 | LIns* text_ch = lir->insLoad(LIR_ldcs, pos, lir->insImm(0)); |
2197 | fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_le, text_ch, lir->insImm(charSet->length)), 0)); |
2198 | LIns* byteIndex = lir->ins2(LIR_rsh, text_ch, lir->insImm(3)); |
2199 | LIns* bitmap = lir->insImmPtr(bitmapData); |
2200 | LIns* byte = lir->insLoad(LIR_ldcb, lir->ins2(LIR_piadd, bitmap, byteIndex), (int) 0); |
2201 | LIns* bitMask = lir->ins2(LIR_lsh, lir->insImm(1), |
2202 | lir->ins2(LIR_and, text_ch, lir->insImm(0x7))); |
2203 | LIns* test = lir->ins2(LIR_eq, lir->ins2(LIR_and, byte, bitMask), lir->insImm(0)); |
2204 | |
2205 | LIns* to_next = lir->insBranch(LIR_jt, test, 0); |
2206 | fails.add(to_next); |
2207 | return lir->ins2(LIR_piadd, pos, lir->insImm(2)); |
2208 | } |
2209 | |
2210 | LIns* compileAlt(RENode* node, LIns* pos, LInsList& fails) |
2211 | { |
2212 | LInsList kidFails(NULL); |
2213 | if (!compileNode((RENode *) node->kid, pos, kidFails)) |
2214 | return NULL; |
2215 | if (!compileNode(node->next, pos, kidFails)) |
2216 | return NULL; |
2217 | |
2218 | if (!targetCurrentPoint(kidFails)) |
2219 | return NULL; |
2220 | if (!compileNode(node->u.altprereq.kid2, pos, fails)) |
2221 | return NULL; |
2222 | /* |
2223 | * Disable compilation for any regexp where something follows an |
2224 | * alternation. To make this work, we need to redesign to either |
2225 | * (a) pass around continuations so we know the right place to go |
2226 | * when we logically return, or (b) generate explicit backtracking |
2227 | * code. |
2228 | */ |
2229 | if (node->next) |
2230 | return NULL; |
2231 | return pos; |
2232 | } |
2233 | |
2234 | #if defined(AVMPLUS_ARM) || defined(AVMPLUS_SPARC) |
2235 | /* We can't do this on ARM or SPARC, since it relies on doing a 32-bit load from |
2236 | * a pointer which is only 2-byte aligned. |
2237 | */ |
2238 | #undef USE_DOUBLE_CHAR_MATCH |
2239 | #else |
2240 | #define USE_DOUBLE_CHAR_MATCH |
2241 | #endif |
2242 | |
2243 | JSBool compileNode(RENode* node, LIns* pos, LInsList& fails) |
2244 | { |
2245 | for (; node; node = node->next) { |
2246 | if (fragment->lirbuf->outOMem()) |
2247 | return JS_FALSE; |
2248 | |
2249 | switch (node->op) { |
2250 | case REOP_EMPTY: |
2251 | pos = compileEmpty(node, pos, fails); |
2252 | break; |
2253 | case REOP_FLAT: |
2254 | #ifdef USE_DOUBLE_CHAR_MATCH |
2255 | if (node->u.flat.length == 1) { |
2256 | if (node->next && node->next->op == REOP_FLAT && |
2257 | node->next->u.flat.length == 1) { |
2258 | pos = compileFlatDoubleChar(node->u.flat.chr, |
2259 | node->next->u.flat.chr, |
2260 | pos, fails); |
2261 | node = node->next; |
2262 | } else { |
2263 | pos = compileFlatSingleChar(node->u.flat.chr, pos, fails); |
2264 | } |
2265 | } else { |
2266 | size_t i; |
2267 | for (i = 0; i < node->u.flat.length - 1; i += 2) { |
2268 | if (fragment->lirbuf->outOMem()) |
2269 | return JS_FALSE; |
2270 | pos = compileFlatDoubleChar(((jschar*) node->kid)[i], |
2271 | ((jschar*) node->kid)[i+1], |
2272 | pos, fails); |
2273 | if (!pos) break; |
2274 | } |
2275 | if (pos && i == node->u.flat.length - 1) |
2276 | pos = compileFlatSingleChar(((jschar*) node->kid)[i], pos, fails); |
2277 | } |
2278 | #else |
2279 | if (node->u.flat.length == 1) { |
2280 | pos = compileFlatSingleChar(node->u.flat.chr, pos, fails); |
2281 | } else { |
2282 | for (size_t i = 0; i < node->u.flat.length; i++) { |
2283 | if (fragment->lirbuf->outOMem()) |
2284 | return JS_FALSE; |
2285 | pos = compileFlatSingleChar(((jschar*) node->kid)[i], pos, fails); |
2286 | if (!pos) break; |
2287 | } |
2288 | } |
2289 | #endif |
2290 | break; |
2291 | case REOP_ALT: |
2292 | case REOP_ALTPREREQ: |
2293 | pos = compileAlt(node, pos, fails); |
2294 | break; |
2295 | case REOP_CLASS: |
2296 | pos = compileClass(node, pos, fails); |
2297 | break; |
2298 | default: |
2299 | return JS_FALSE; |
2300 | } |
2301 | if (!pos) |
2302 | return JS_FALSE; |
2303 | } |
2304 | |
2305 | lir->insStorei(pos, state, (int) offsetof(REMatchState, cp)); |
2306 | lir->ins1(LIR_ret, state); |
2307 | return JS_TRUE; |
2308 | } |
2309 | |
2310 | JSBool compileSticky(RENode* root, LIns* start) |
2311 | { |
2312 | LInsList fails(NULL); |
2313 | if (!compileNode(root, start, fails)) |
2314 | return JS_FALSE; |
2315 | if (!targetCurrentPoint(fails)) |
2316 | return JS_FALSE; |
2317 | lir->ins1(LIR_ret, lir->insImm(0)); |
2318 | return JS_TRUE; |
2319 | } |
2320 | |
2321 | JSBool compileAnchoring(RENode* root, LIns* start) |
2322 | { |
2323 | /* Even at the end, the empty regexp would match. */ |
2324 | LIns* to_next = lir->insBranch(LIR_jf, |
2325 | lir->ins2(LIR_le, start, cpend), 0); |
2326 | LInsList fails(NULL); |
2327 | if (!compileNode(root, start, fails)) |
2328 | return JS_FALSE; |
2329 | |
2330 | if (!targetCurrentPoint(to_next)) |
2331 | return JS_FALSE; |
2332 | lir->ins1(LIR_ret, lir->insImm(0)); |
2333 | |
2334 | if (!targetCurrentPoint(fails)) |
2335 | return JS_FALSE; |
2336 | lir->insStorei(lir->ins2(LIR_piadd, start, lir->insImm(2)), gdata, |
2337 | (int) offsetof(REGlobalData, skipped)); |
2338 | |
2339 | return JS_TRUE; |
2340 | } |
2341 | |
2342 | inline LIns* |
2343 | addName(LirBuffer* lirbuf, LIns* ins, const char* name) |
2344 | { |
2345 | #ifdef NJ_VERBOSE |
2346 | debug_only_v(lirbuf->names->addName(ins, name);) |
2347 | #endif |
2348 | return ins; |
2349 | } |
2350 | |
2351 | /* |
2352 | * Insert the side exit and guard record for a compiled regexp. Most |
2353 | * of the fields are not used. The important part is the regexp source |
2354 | * and flags, which we use as the fragment lookup key. |
2355 | */ |
2356 | GuardRecord* insertGuard(jschar* re_chars, size_t re_length) |
2357 | { |
2358 | LIns* skip = lirBufWriter->skip(sizeof(GuardRecord) + |
2359 | sizeof(RESideExit) + |
2360 | (re_length-1) * sizeof(jschar)); |
2361 | GuardRecord* guard = (GuardRecord *) skip->payload(); |
2362 | memset(guard, 0, sizeof(*guard)); |
2363 | RESideExit* exit = (RESideExit*)(guard+1); |
2364 | guard->exit = exit; |
2365 | guard->exit->target = fragment; |
2366 | exit->re_flags = re->flags; |
2367 | exit->re_length = re_length; |
2368 | memcpy(exit->re_chars, re_chars, re_length * sizeof(jschar)); |
2369 | fragment->lastIns = lir->insGuard(LIR_loop, lir->insImm(1), skip); |
2370 | return guard; |
2371 | } |
2372 | |
2373 | public: |
2374 | RegExpNativeCompiler(JSRegExp* re, CompilerState* cs, Fragment* fragment) |
2375 | : re(re), cs(cs), fragment(fragment), lir(NULL), lirBufWriter(NULL) { } |
2376 | |
2377 | JSBool compile(JSContext* cx) |
2378 | { |
2379 | GuardRecord* guard = NULL; |
2380 | LIns* start; |
2381 | bool oom = false; |
2382 | jschar* re_chars; |
2383 | size_t re_length; |
2384 | Fragmento* fragmento = JS_TRACE_MONITOR(cx).reFragmento; |
2385 | |
2386 | JSSTRING_CHARS_AND_LENGTH(re->source, re_chars, re_length); |
2387 | /* |
2388 | * If the regexp is too long nanojit will assert when we |
2389 | * try to insert the guard record. |
2390 | */ |
2391 | if (re_length > 1024) |
2392 | return JS_FALSE; |
2393 | |
2394 | this->cx = cx; |
2395 | /* At this point we have an empty fragment. */ |
2396 | LirBuffer* lirbuf = fragment->lirbuf; |
2397 | if (lirbuf->outOMem()) |
2398 | goto fail; |
2399 | /* FIXME Use bug 463260 smart pointer when available. */ |
2400 | lir = lirBufWriter = new (&gc) LirBufWriter(lirbuf); |
2401 | |
2402 | /* FIXME Use bug 463260 smart pointer when available. */ |
2403 | #ifdef NJ_VERBOSE |
2404 | debug_only_v(fragment->lirbuf->names = new (&gc) LirNameMap(&gc, NULL, fragmento->labels);) |
2405 | #endif |
2406 | /* FIXME Use bug 463260 smart pointer when available. */ |
2407 | #ifdef NJ_VERBOSE |
2408 | debug_only_v(lir = new (&gc) VerboseWriter(&gc, lir, lirbuf->names);) |
2409 | #endif |
2410 | |
2411 | lir->ins0(LIR_start); |
2412 | lirbuf->state = state = addName(lirbuf, lir->insParam(0, 0), "state"); |
2413 | lirbuf->param1 = gdata = addName(lirbuf, lir->insParam(1, 0), "gdata"); |
2414 | start = addName(lirbuf, lir->insLoad(LIR_ldp, lirbuf->param1, (int) offsetof(REGlobalData, skipped)), "start"); |
2415 | cpend = addName(lirbuf, lir->insLoad(LIR_ldp, lirbuf->param1, offsetof(REGlobalData, cpend)), "cpend"); |
2416 | |
2417 | if (cs->flags & JSREG_STICKY) { |
2418 | if (!compileSticky(cs->result, start)) |
2419 | goto fail; |
2420 | } else { |
2421 | if (!compileAnchoring(cs->result, start)) |
2422 | goto fail; |
2423 | } |
2424 | |
2425 | guard = insertGuard(re_chars, re_length); |
2426 | |
2427 | if (lirbuf->outOMem()) |
2428 | goto fail; |
2429 | ::compile(fragmento->assm(), fragment); |
2430 | if (fragmento->assm()->error() != nanojit::None) { |
2431 | oom = fragmento->assm()->error() == nanojit::OutOMem; |
2432 | goto fail; |
2433 | } |
2434 | |
2435 | delete lirBufWriter; |
2436 | #ifdef NJ_VERBOSE |
2437 | debug_only_v(delete lir;) |
2438 | #endif |
2439 | return JS_TRUE; |
2440 | fail: |
2441 | if (lirbuf->outOMem() || oom || |
2442 | js_OverfullFragmento(&JS_TRACE_MONITOR(cx), fragmento)) { |
2443 | fragmento->clearFrags(); |
2444 | lirbuf->rewind(); |
2445 | } else { |
2446 | if (!guard) insertGuard(re_chars, re_length); |
2447 | fragment->blacklist(); |
2448 | } |
2449 | delete lirBufWriter; |
2450 | #ifdef NJ_VERBOSE |
2451 | debug_only_v(delete lir;) |
2452 | #endif |
2453 | return JS_FALSE; |
2454 | } |
2455 | }; |
2456 | |
2457 | /* |
2458 | * Compile a regexp to native code in the given fragment. |
2459 | */ |
2460 | static inline JSBool |
2461 | CompileRegExpToNative(JSContext* cx, JSRegExp* re, Fragment* fragment) |
2462 | { |
2463 | JSBool rv = JS_FALSE; |
2464 | void* mark; |
2465 | CompilerState state; |
2466 | RegExpNativeCompiler rc(re, &state, fragment); |
2467 | |
2468 | JS_ASSERT(!fragment->code()); |
2469 | JS_ASSERT(!fragment->isBlacklisted()); |
2470 | |
2471 | mark = JS_ARENA_MARK(&cx->tempPool); |
2472 | if (!CompileRegExpToAST(cx, NULL, re->source, re->flags, state)) { |
2473 | goto out; |
2474 | } |
2475 | rv = rc.compile(cx); |
2476 | out: |
2477 | JS_ARENA_RELEASE(&cx->tempPool, mark); |
2478 | return rv; |
2479 | } |
2480 | |
2481 | /* Function type for a compiled native regexp. */ |
2482 | typedef REMatchState* (FASTCALL *NativeRegExp)(REMatchState*, REGlobalData*); |
2483 | |
2484 | /* |
2485 | * Return a compiled native regexp if one already exists or can be created |
2486 | * now, or NULL otherwise. |
2487 | */ |
2488 | static NativeRegExp |
2489 | GetNativeRegExp(JSContext* cx, JSRegExp* re) |
2490 | { |
2491 | Fragment *fragment; |
2492 | jschar* re_chars; |
2493 | size_t re_length; |
2494 | Fragmento* fragmento = JS_TRACE_MONITOR(cx).reFragmento; |
2495 | |
2496 | JSSTRING_CHARS_AND_LENGTH(re->source, re_chars, re_length); |
2497 | void* hash = HashRegExp(re->flags, re_chars, re_length); |
2498 | fragment = LookupNativeRegExp(cx, hash, re->flags, re_chars, re_length); |
2499 | if (fragment) { |
2500 | if (fragment->code()) |
2501 | goto ok; |
2502 | if (fragment->isBlacklisted()) |
2503 | return NULL; |
2504 | } else { |
2505 | fragment = fragmento->getAnchor(hash); |
2506 | fragment->lirbuf = JS_TRACE_MONITOR(cx).reLirBuf; |
2507 | fragment->root = fragment; |
2508 | } |
2509 | |
2510 | if (!CompileRegExpToNative(cx, re, fragment)) |
2511 | return NULL; |
2512 | ok: |
2513 | union { NIns *code; NativeRegExp func; } u; |
2514 | u.code = fragment->code(); |
2515 | return u.func; |
2516 | } |
2517 | #endif |
2518 | |
2519 | JSRegExp * |
2520 | js_NewRegExp(JSContext *cx, JSTokenStream *ts, |
2521 | JSString *str, uintN flags, JSBool flat) |
2522 | { |
2523 | JSRegExp *re; |
2524 | void *mark; |
2525 | CompilerState state; |
2526 | size_t resize; |
2527 | jsbytecode *endPC; |
2528 | uintN i; |
2529 | |
2530 | re = NULL; |
2531 | mark = JS_ARENA_MARK(&cx->tempPool); |
2532 | |
2533 | /* |
2534 | * Parsing the string as flat is now expressed internally using |
2535 | * a flag, so that we keep this information in the JSRegExp, but |
2536 | * we keep the 'flat' parameter for now for compatibility. |
2537 | */ |
2538 | if (flat) flags |= JSREG_FLAT; |
2539 | if (!CompileRegExpToAST(cx, ts, str, flags, state)) |
2540 | goto out; |
2541 | |
2542 | resize = offsetof(JSRegExp, program) + state.progLength + 1; |
2543 | re = (JSRegExp *) JS_malloc(cx, resize); |
2544 | if (!re) |
2545 | goto out; |
2546 | |
2547 | re->nrefs = 1; |
2548 | JS_ASSERT(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT); |
2549 | re->classCount = state.classCount; |
2550 | if (re->classCount) { |
2551 | re->classList = (RECharSet *) |
2552 | JS_malloc(cx, re->classCount * sizeof(RECharSet)); |
2553 | if (!re->classList) { |
2554 | js_DestroyRegExp(cx, re); |
2555 | re = NULL; |
2556 | goto out; |
2557 | } |
2558 | for (i = 0; i < re->classCount; i++) |
2559 | re->classList[i].converted = JS_FALSE; |
2560 | } else { |
2561 | re->classList = NULL; |
2562 | } |
2563 | |
2564 | /* Compile the bytecode version. */ |
2565 | endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result); |
2566 | if (!endPC) { |
2567 | js_DestroyRegExp(cx, re); |
2568 | re = NULL; |
2569 | goto out; |
2570 | } |
2571 | *endPC++ = REOP_END; |
2572 | /* |
2573 | * Check whether size was overestimated and shrink using realloc. |
2574 | * This is safe since no pointers to newly parsed regexp or its parts |
2575 | * besides re exist here. |
2576 | */ |
2577 | if ((size_t)(endPC - re->program) != state.progLength + 1) { |
2578 | JSRegExp *tmp; |
2579 | JS_ASSERT((size_t)(endPC - re->program) < state.progLength + 1); |
2580 | resize = offsetof(JSRegExp, program) + (endPC - re->program); |
2581 | tmp = (JSRegExp *) JS_realloc(cx, re, resize); |
2582 | if (tmp) |
2583 | re = tmp; |
2584 | } |
2585 | |
2586 | re->flags = flags; |
2587 | re->parenCount = state.parenCount; |
2588 | re->source = str; |
2589 | |
2590 | out: |
2591 | JS_ARENA_RELEASE(&cx->tempPool, mark); |
2592 | return re; |
2593 | } |
2594 | |
2595 | JSRegExp * |
2596 | js_NewRegExpOpt(JSContext *cx, JSString *str, JSString *opt, JSBool flat) |
2597 | { |
2598 | uintN flags; |
2599 | jschar *s; |
2600 | size_t i, n; |
2601 | char charBuf[2]; |
2602 | |
2603 | flags = 0; |
2604 | if (opt) { |
2605 | JSSTRING_CHARS_AND_LENGTH(opt, s, n); |
2606 | for (i = 0; i < n; i++) { |
2607 | #define HANDLE_FLAG(name) \ |
2608 | JS_BEGIN_MACRO \ |
2609 | if (flags & (name)) \ |
2610 | goto bad_flag; \ |
2611 | flags |= (name); \ |
2612 | JS_END_MACRO |
2613 | switch (s[i]) { |
2614 | case 'g': |
2615 | HANDLE_FLAG(JSREG_GLOB); |
2616 | break; |
2617 | case 'i': |
2618 | HANDLE_FLAG(JSREG_FOLD); |
2619 | break; |
2620 | case 'm': |
2621 | HANDLE_FLAG(JSREG_MULTILINE); |
2622 | break; |
2623 | case 'y': |
2624 | HANDLE_FLAG(JSREG_STICKY); |
2625 | break; |
2626 | default: |
2627 | bad_flag: |
2628 | charBuf[0] = (char)s[i]; |
2629 | charBuf[1] = '\0'; |
2630 | JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, |
2631 | js_GetErrorMessage, NULL, |
2632 | JSMSG_BAD_REGEXP_FLAG, charBuf); |
2633 | return NULL; |
2634 | } |
2635 | #undef HANDLE_FLAG |
2636 | } |
2637 | } |
2638 | return js_NewRegExp(cx, NULL, str, flags, flat); |
2639 | } |
2640 | |
2641 | /* |
2642 | * Save the current state of the match - the position in the input |
2643 | * text as well as the position in the bytecode. The state of any |
2644 | * parent expressions is also saved (preceding state). |
2645 | * Contents of parenCount parentheses from parenIndex are also saved. |
2646 | */ |
2647 | static REBackTrackData * |
2648 | PushBackTrackState(REGlobalData *gData, REOp op, |
2649 | jsbytecode *target, REMatchState *x, const jschar *cp, |
2650 | size_t parenIndex, size_t parenCount) |
2651 | { |
2652 | size_t i; |
2653 | REBackTrackData *result = |
2654 | (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz); |
2655 | |
2656 | size_t sz = sizeof(REBackTrackData) + |
2657 | gData->stateStackTop * sizeof(REProgState) + |
2658 | parenCount * sizeof(RECapture); |
2659 | |
2660 | ptrdiff_t btsize = gData->backTrackStackSize; |
2661 | ptrdiff_t btincr = ((char *)result + sz) - |
2662 | ((char *)gData->backTrackStack + btsize); |
2663 | |
2664 | re_debug("\tBT_Push: %lu,%lu", |
2665 | (unsigned long) parenIndex, (unsigned long) parenCount); |
2666 | |
2667 | if (btincr > 0) { |
2668 | ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack; |
2669 | |
2670 | btincr = JS_ROUNDUP(btincr, btsize); |
2671 | JS_ARENA_GROW_CAST(gData->backTrackStack, REBackTrackData *, |
2672 | &gData->cx->regexpPool, btsize, btincr); |
2673 | if (!gData->backTrackStack) { |
2674 | js_ReportOutOfScriptQuota(gData->cx); |
2675 | gData->ok = JS_FALSE; |
2676 | return NULL; |
2677 | } |
2678 | gData->backTrackStackSize = btsize + btincr; |
2679 | result = (REBackTrackData *) ((char *)gData->backTrackStack + offset); |
2680 | } |
2681 | gData->backTrackSP = result; |
2682 | result->sz = gData->cursz; |
2683 | gData->cursz = sz; |
2684 | |
2685 | result->backtrack_op = op; |
2686 | result->backtrack_pc = target; |
2687 | result->cp = cp; |
2688 | result->parenCount = parenCount; |
2689 | result->parenIndex = parenIndex; |
2690 | |
2691 | result->saveStateStackTop = gData->stateStackTop; |
2692 | JS_ASSERT(gData->stateStackTop); |
2693 | memcpy(result + 1, gData->stateStack, |
2694 | sizeof(REProgState) * result->saveStateStackTop); |
2695 | |
2696 | if (parenCount != 0) { |
2697 | memcpy((char *)(result + 1) + |
2698 | sizeof(REProgState) * result->saveStateStackTop, |
2699 | &x->parens[parenIndex], |
2700 | sizeof(RECapture) * parenCount); |
2701 | for (i = 0; i != parenCount; i++) |
2702 | x->parens[parenIndex + i].index = -1; |
2703 | } |
2704 | |
2705 | return result; |
2706 | } |
2707 | |
2708 | |
2709 | /* |
2710 | * Consecutive literal characters. |
2711 | */ |
2712 | #if 0 |
2713 | static REMatchState * |
2714 | FlatNMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars, |
2715 | size_t length) |
2716 | { |
2717 | size_t i; |
2718 | if (length > gData->cpend - x->cp) |
2719 | return NULL; |
2720 | for (i = 0; i != length; i++) { |
2721 | if (matchChars[i] != x->cp[i]) |
2722 | return NULL; |
2723 | } |
2724 | x->cp += length; |
2725 | return x; |
2726 | } |
2727 | #endif |
2728 | |
2729 | static JS_ALWAYS_INLINE REMatchState * |
2730 | FlatNIMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars, |
2731 | size_t length) |
2732 | { |
2733 | size_t i; |
2734 | JS_ASSERT(gData->cpend >= x->cp); |
2735 | if (length > (size_t)(gData->cpend - x->cp)) |
2736 | return NULL; |
2737 | for (i = 0; i != length; i++) { |
2738 | if (upcase(matchChars[i]) != upcase(x->cp[i])) |
2739 | return NULL; |
2740 | } |
2741 | x->cp += length; |
2742 | return x; |
2743 | } |
2744 | |
2745 | /* |
2746 | * 1. Evaluate DecimalEscape to obtain an EscapeValue E. |
2747 | * 2. If E is not a character then go to step 6. |
2748 | * 3. Let ch be E's character. |
2749 | * 4. Let A be a one-element RECharSet containing the character ch. |
2750 | * 5. Call CharacterSetMatcher(A, false) and return its Matcher result. |
2751 | * 6. E must be an integer. Let n be that integer. |
2752 | * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception. |
2753 | * 8. Return an internal Matcher closure that takes two arguments, a State x |
2754 | * and a Continuation c, and performs the following: |
2755 | * 1. Let cap be x's captures internal array. |
2756 | * 2. Let s be cap[n]. |
2757 | * 3. If s is undefined, then call c(x) and return its result. |
2758 | * 4. Let e be x's endIndex. |
2759 | * 5. Let len be s's length. |
2760 | * 6. Let f be e+len. |
2761 | * 7. If f>InputLength, return failure. |
2762 | * 8. If there exists an integer i between 0 (inclusive) and len (exclusive) |
2763 | * such that Canonicalize(s[i]) is not the same character as |
2764 | * Canonicalize(Input [e+i]), then return failure. |
2765 | * 9. Let y be the State (f, cap). |
2766 | * 10. Call c(y) and return its result. |
2767 | */ |
2768 | static REMatchState * |
2769 | BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex) |
2770 | { |
2771 | size_t len, i; |
2772 | const jschar *parenContent; |
2773 | RECapture *cap = &x->parens[parenIndex]; |
2774 | |
2775 | if (cap->index == -1) |
2776 | return x; |
2777 | |
2778 | len = cap->length; |
2779 | if (x->cp + len > gData->cpend) |
2780 | return NULL; |
2781 | |
2782 | parenContent = &gData->cpbegin[cap->index]; |
2783 | if (gData->regexp->flags & JSREG_FOLD) { |
2784 | for (i = 0; i < len; i++) { |
2785 | if (upcase(parenContent[i]) != upcase(x->cp[i])) |
2786 | return NULL; |
2787 | } |
2788 | } else { |
2789 | for (i = 0; i < len; i++) { |
2790 | if (parenContent[i] != x->cp[i]) |
2791 | return NULL; |
2792 | } |
2793 | } |
2794 | x->cp += len; |
2795 | return x; |
2796 | } |
2797 | |
2798 | |
2799 | /* Add a single character to the RECharSet */ |
2800 | static void |
2801 | AddCharacterToCharSet(RECharSet *cs, jschar c) |
2802 | { |
2803 | uintN byteIndex = (uintN)(c >> 3); |
2804 | JS_ASSERT(c <= cs->length); |
2805 | cs->u.bits[byteIndex] |= 1 << (c & 0x7); |
2806 | } |
2807 | |
2808 | |
2809 | /* Add a character range, c1 to c2 (inclusive) to the RECharSet */ |
2810 | static void |
2811 | AddCharacterRangeToCharSet(RECharSet *cs, uintN c1, uintN c2) |
2812 | { |
2813 | uintN i; |
2814 | |
2815 | uintN byteIndex1 = c1 >> 3; |
2816 | uintN byteIndex2 = c2 >> 3; |
2817 | |
2818 | JS_ASSERT(c2 <= cs->length && c1 <= c2); |
2819 | |
2820 | c1 &= 0x7; |
2821 | c2 &= 0x7; |
2822 | |
2823 | if (byteIndex1 == byteIndex2) { |
2824 | cs->u.bits[byteIndex1] |= ((uint8)0xFF >> (7 - (c2 - c1))) << c1; |
2825 | } else { |
2826 | cs->u.bits[byteIndex1] |= 0xFF << c1; |
2827 | for (i = byteIndex1 + 1; i < byteIndex2; i++) |
2828 | cs->u.bits[i] = 0xFF; |
2829 | cs->u.bits[byteIndex2] |= (uint8)0xFF >> (7 - c2); |
2830 | } |
2831 | } |
2832 | |
2833 | struct CharacterRange { |
2834 | jschar start; |
2835 | jschar end; |
2836 | }; |
2837 | |
2838 | /* |
2839 | * The following characters are taken from the ECMA-262 standard, section 7.2 |
2840 | * and 7.3, and the Unicode 3 standard, Table 6-1. |
2841 | */ |
2842 | static const CharacterRange WhiteSpaceRanges[] = { |
2843 | /* TAB, LF, VT, FF, CR */ |
2844 | { 0x0009, 0x000D }, |
2845 | /* SPACE */ |
2846 | { 0x0020, 0x0020 }, |
2847 | /* NO-BREAK SPACE */ |
2848 | { 0x00A0, 0x00A0 }, |
2849 | /* |
2850 | * EN QUAD, EM QUAD, EN SPACE, EM SPACE, THREE-PER-EM SPACE, FOUR-PER-EM |
2851 | * SPACE, SIX-PER-EM SPACE, FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE, |
2852 | * HAIR SPACE, ZERO WIDTH SPACE |
2853 | */ |
2854 | { 0x2000, 0x200B }, |
2855 | /* LS, PS */ |
2856 | { 0x2028, 0x2029 }, |
2857 | /* NARROW NO-BREAK SPACE */ |
2858 | { 0x202F, 0x202F }, |
2859 | /* IDEOGRAPHIC SPACE */ |
2860 | { 0x3000, 0x3000 } |
2861 | }; |
2862 | |
2863 | /* ECMA-262 standard, section 15.10.2.6. */ |
2864 | static const CharacterRange WordRanges[] = { |
2865 | { jschar('0'), jschar('9') }, |
2866 | { jschar('A'), jschar('Z') }, |
2867 | { jschar('_'), jschar('_') }, |
2868 | { jschar('a'), jschar('z') } |
2869 | }; |
2870 | |
2871 | static void |
2872 | AddCharacterRanges(RECharSet *charSet, |
2873 | const CharacterRange *range, |
2874 | const CharacterRange *end) |
2875 | { |
2876 | for (; range < end; ++range) |
2877 | AddCharacterRangeToCharSet(charSet, range->start, range->end); |
2878 | } |
2879 | |
2880 | static void |
2881 | AddInvertedCharacterRanges(RECharSet *charSet, |
2882 | const CharacterRange *range, |
2883 | const CharacterRange *end) |
2884 | { |
2885 | uint16 previous = 0; |
2886 | for (; range < end; ++range) { |
2887 | AddCharacterRangeToCharSet(charSet, previous, range->start - 1); |
2888 | previous = range->end + 1; |
2889 | } |
2890 | AddCharacterRangeToCharSet(charSet, previous, charSet->length); |
2891 | } |
2892 | |
2893 | /* Compile the source of the class into a RECharSet */ |
2894 | static JSBool |
2895 | ProcessCharSet(JSContext *cx, JSRegExp *re, RECharSet *charSet) |
2896 | { |
2897 | const jschar *src, *end; |
2898 | JSBool inRange = JS_FALSE; |
2899 | jschar rangeStart = 0; |
2900 | uintN byteLength, n; |
2901 | jschar c, thisCh; |
2902 | intN nDigits, i; |
2903 | |
2904 | JS_ASSERT(!charSet->converted); |
2905 | /* |
2906 | * Assert that startIndex and length points to chars inside [] inside |
2907 | * source string. |
2908 | */ |
2909 | JS_ASSERT(1 <= charSet->u.src.startIndex); |
2910 | JS_ASSERT(charSet->u.src.startIndex |
2911 | < JSSTRING_LENGTH(re->source)); |
2912 | JS_ASSERT(charSet->u.src.length <= JSSTRING_LENGTH(re->source) |
2913 | - 1 - charSet->u.src.startIndex); |
2914 | |
2915 | charSet->converted = JS_TRUE; |
2916 | src = JSSTRING_CHARS(re->source) + charSet->u.src.startIndex; |
2917 | end = src + charSet->u.src.length; |
2918 | JS_ASSERT(src[-1] == '['); |
2919 | JS_ASSERT(end[0] == ']'); |
2920 | |
2921 | byteLength = (charSet->length >> 3) + 1; |
2922 | charSet->u.bits = (uint8 *)JS_malloc(cx, byteLength); |
2923 | if (!charSet->u.bits) { |
2924 | JS_ReportOutOfMemory(cx); |
2925 | return JS_FALSE; |
2926 | } |
2927 | memset(charSet->u.bits, 0, byteLength); |
2928 | |
2929 | if (src == end) |
2930 | return JS_TRUE; |
2931 | |
2932 | if (*src == '^') { |
2933 | JS_ASSERT(charSet->sense == JS_FALSE); |
2934 | ++src; |
2935 | } else { |
2936 | JS_ASSERT(charSet->sense == JS_TRUE); |
2937 | } |
2938 | |
2939 | while (src != end) { |
2940 | switch (*src) { |
2941 | case '\\': |
2942 | ++src; |
2943 | c = *src++; |
2944 | switch (c) { |
2945 | case 'b': |
2946 | thisCh = 0x8; |
2947 | break; |
2948 | case 'f': |
2949 | thisCh = 0xC; |
2950 | break; |
2951 | case 'n': |
2952 | thisCh = 0xA; |
2953 | break; |
2954 | case 'r': |
2955 | thisCh = 0xD; |
2956 | break; |
2957 | case 't': |
2958 | thisCh = 0x9; |
2959 | break; |
2960 | case 'v': |
2961 | thisCh = 0xB; |
2962 | break; |
2963 | case 'c': |
2964 | if (src < end && JS_ISWORD(*src)) { |
2965 | thisCh = (jschar)(*src++ & 0x1F); |
2966 | } else { |
2967 | --src; |
2968 | thisCh = '\\'; |
2969 | } |
2970 | break; |
2971 | case 'x': |
2972 | nDigits = 2; |
2973 | goto lexHex; |
2974 | case 'u': |
2975 | nDigits = 4; |
2976 | lexHex: |
2977 | n = 0; |
2978 | for (i = 0; (i < nDigits) && (src < end); i++) { |
2979 | uintN digit; |
2980 | c = *src++; |
2981 | if (!isASCIIHexDigit(c, &digit)) { |
2982 | /* |
2983 | * Back off to accepting the original '\' |
2984 | * as a literal |
2985 | */ |
2986 | src -= i + 1; |
2987 | n = '\\'; |
2988 | break; |
2989 | } |
2990 | n = (n << 4) | digit; |
2991 | } |
2992 | thisCh = (jschar)n; |
2993 | break; |
2994 | case '0': |
2995 | case '1': |
2996 | case '2': |
2997 | case '3': |
2998 | case '4': |
2999 | case '5': |
3000 | case '6': |
3001 | case '7': |
3002 | /* |
3003 | * This is a non-ECMA extension - decimal escapes (in this |
3004 | * case, octal!) are supposed to be an error inside class |
3005 | * ranges, but supported here for backwards compatibility. |
3006 | */ |
3007 | n = JS7_UNDEC(c); |
3008 | c = *src; |
3009 | if ('0' <= c && c <= '7') { |
3010 | src++; |
3011 | n = 8 * n + JS7_UNDEC(c); |
3012 | c = *src; |
3013 | if ('0' <= c && c <= '7') { |
3014 | src++; |
3015 | i = 8 * n + JS7_UNDEC(c); |
3016 | if (i <= 0377) |
3017 | n = i; |
3018 | else |
3019 | src--; |
3020 | } |
3021 | } |
3022 | thisCh = (jschar)n; |
3023 | break; |
3024 | |
3025 | case 'd': |
3026 | AddCharacterRangeToCharSet(charSet, '0', '9'); |
3027 | continue; /* don't need range processing */ |
3028 | case 'D': |
3029 | AddCharacterRangeToCharSet(charSet, 0, '0' - 1); |
3030 | AddCharacterRangeToCharSet(charSet, |
3031 | (jschar)('9' + 1), |
3032 | (jschar)charSet->length); |
3033 | continue; |
3034 | case 's': |
3035 | AddCharacterRanges(charSet, WhiteSpaceRanges, |
3036 | WhiteSpaceRanges + JS_ARRAY_LENGTH(WhiteSpaceRanges)); |
3037 | continue; |
3038 | case 'S': |
3039 | AddInvertedCharacterRanges(charSet, WhiteSpaceRanges, |
3040 | WhiteSpaceRanges + JS_ARRAY_LENGTH(WhiteSpaceRanges)); |
3041 | continue; |
3042 | case 'w': |
3043 | AddCharacterRanges(charSet, WordRanges, |
3044 | WordRanges + JS_ARRAY_LENGTH(WordRanges)); |
3045 | continue; |
3046 | case 'W': |
3047 | AddInvertedCharacterRanges(charSet, WordRanges, |
3048 | WordRanges + JS_ARRAY_LENGTH(WordRanges)); |
3049 | continue; |
3050 | default: |
3051 | thisCh = c; |
3052 | break; |
3053 | |
3054 | } |
3055 | break; |
3056 | |
3057 | default: |
3058 | thisCh = *src++; |
3059 | break; |
3060 | |
3061 | } |
3062 | if (inRange) { |
3063 | if (re->flags & JSREG_FOLD) { |
3064 | int i; |
3065 | |
3066 | JS_ASSERT(rangeStart <= thisCh); |
3067 | for (i = rangeStart; i <= thisCh; i++) { |
3068 | jschar uch, dch; |
3069 | |
3070 | AddCharacterToCharSet(charSet, i); |
3071 | uch = upcase(i); |
3072 | dch = downcase(i); |
3073 | if (i != uch) |
3074 | AddCharacterToCharSet(charSet, uch); |
3075 | if (i != dch) |
3076 | AddCharacterToCharSet(charSet, dch); |
3077 | } |
3078 | } else { |
3079 | AddCharacterRangeToCharSet(charSet, rangeStart, thisCh); |
3080 | } |
3081 | inRange = JS_FALSE; |
3082 | } else { |
3083 | if (re->flags & JSREG_FOLD) { |
3084 | AddCharacterToCharSet(charSet, upcase(thisCh)); |
3085 | AddCharacterToCharSet(charSet, downcase(thisCh)); |
3086 | } else { |
3087 | AddCharacterToCharSet(charSet, thisCh); |
3088 | } |
3089 | if (src < end - 1) { |
3090 | if (*src == '-') { |
3091 | ++src; |
3092 | inRange = JS_TRUE; |
3093 | rangeStart = thisCh; |
3094 | } |
3095 | } |
3096 | } |
3097 | } |
3098 | return JS_TRUE; |
3099 | } |
3100 | |
3101 | static inline JSBool |
3102 | MatcherProcessCharSet(REGlobalData *gData, RECharSet *charSet) { |
3103 | JSBool rv = ProcessCharSet(gData->cx, gData->regexp, charSet); |
3104 | if (!rv) gData->ok = JS_FALSE; |
3105 | return rv; |
3106 | } |
3107 | |
3108 | void |
3109 | js_DestroyRegExp(JSContext *cx, JSRegExp *re) |
3110 | { |
3111 | if (JS_ATOMIC_DECREMENT(&re->nrefs) == 0) { |
3112 | if (re->classList) { |
3113 | uintN i; |
3114 | for (i = 0; i < re->classCount; i++) { |
3115 | if (re->classList[i].converted) |
3116 | JS_free(cx, re->classList[i].u.bits); |
3117 | re->classList[i].u.bits = NULL; |
3118 | } |
3119 | JS_free(cx, re->classList); |
3120 | } |
3121 | JS_free(cx, re); |
3122 | } |
3123 | } |
3124 | |
3125 | static JSBool |
3126 | ReallocStateStack(REGlobalData *gData) |
3127 | { |
3128 | size_t limit = gData->stateStackLimit; |
3129 | size_t sz = sizeof(REProgState) * limit; |
3130 | |
3131 | JS_ARENA_GROW_CAST(gData->stateStack, REProgState *, |
3132 | &gData->cx->regexpPool, sz, sz); |
3133 | if (!gData->stateStack) { |
3134 | js_ReportOutOfScriptQuota(gData->cx); |
3135 | gData->ok = JS_FALSE; |
3136 | return JS_FALSE; |
3137 | } |
3138 | gData->stateStackLimit = limit + limit; |
3139 | return JS_TRUE; |
3140 | } |
3141 | |
3142 | #define PUSH_STATE_STACK(data) \ |
3143 | JS_BEGIN_MACRO \ |
3144 | ++(data)->stateStackTop; \ |
3145 | if ((data)->stateStackTop == (data)->stateStackLimit && \ |
3146 | !ReallocStateStack((data))) { \ |
3147 | return NULL; \ |
3148 | } \ |
3149 | JS_END_MACRO |
3150 | |
3151 | /* |
3152 | * Apply the current op against the given input to see if it's going to match |
3153 | * or fail. Return false if we don't get a match, true if we do. If updatecp is |
3154 | * true, then update the current state's cp. Always update startpc to the next |
3155 | * op. |
3156 | */ |
3157 | static JS_ALWAYS_INLINE REMatchState * |
3158 | SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op, |
3159 | jsbytecode **startpc, JSBool updatecp) |
3160 | { |
3161 | REMatchState *result = NULL; |
3162 | jschar matchCh; |
3163 | size_t parenIndex; |
3164 | size_t offset, length, index; |
3165 | jsbytecode *pc = *startpc; /* pc has already been incremented past op */ |
3166 | jschar *source; |
3167 | const jschar *startcp = x->cp; |
3168 | jschar ch; |
3169 | RECharSet *charSet; |
3170 | |
3171 | #ifdef REGEXP_DEBUG |
3172 | const char *opname = reop_names[op]; |
3173 | re_debug("\n%06d: %*s%s", pc - gData->regexp->program, |
3174 | gData->stateStackTop * 2, "", opname); |
3175 | #endif |
3176 | switch (op) { |
3177 | case REOP_EMPTY: |
3178 | result = x; |
3179 | break; |
3180 | case REOP_BOL: |
3181 | if (x->cp != gData->cpbegin) { |
3182 | if (!gData->cx->regExpStatics.multiline && |
3183 | !(gData->regexp->flags & JSREG_MULTILINE)) { |
3184 | break; |
3185 | } |
3186 | if (!RE_IS_LINE_TERM(x->cp[-1])) |
3187 | break; |
3188 | } |
3189 | result = x; |
3190 | break; |
3191 | case REOP_EOL: |
3192 | if (x->cp != gData->cpend) { |
3193 | if (!gData->cx->regExpStatics.multiline && |
3194 | !(gData->regexp->flags & JSREG_MULTILINE)) { |
3195 | break; |
3196 | } |
3197 | if (!RE_IS_LINE_TERM(*x->cp)) |
3198 | break; |
3199 | } |
3200 | result = x; |
3201 | break; |
3202 | case REOP_WBDRY: |
3203 | if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^ |
3204 | !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) { |
3205 | result = x; |
3206 | } |
3207 | break; |
3208 | case REOP_WNONBDRY: |
3209 | if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^ |
3210 | (x->cp != gData->cpend && JS_ISWORD(*x->cp))) { |
3211 | result = x; |
3212 | } |
3213 | break; |
3214 | case REOP_DOT: |
3215 | if (x->cp != gData->cpend && !RE_IS_LINE_TERM( |