/[jscoverage]/trunk/js/jsregexp.cpp
ViewVC logotype

Contents of /trunk/js/jsregexp.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 399 - (show annotations)
Tue Dec 9 03:37:47 2008 UTC (11 years, 1 month ago) by siliconforks
File size: 159806 byte(s)
Use SpiderMonkey from Firefox 3.1b2.

1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 * vim: set sw=4 ts=8 et tw=78:
3 *
4 * ***** BEGIN LICENSE BLOCK *****
5 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 *
7 * The contents of this file are subject to the Mozilla Public License Version
8 * 1.1 (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 * http://www.mozilla.org/MPL/
11 *
12 * Software distributed under the License is distributed on an "AS IS" basis,
13 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
14 * for the specific language governing rights and limitations under the
15 * License.
16 *
17 * The Original Code is Mozilla Communicator client code, released
18 * March 31, 1998.
19 *
20 * The Initial Developer of the Original Code is
21 * Netscape Communications Corporation.
22 * Portions created by the Initial Developer are Copyright (C) 1998
23 * the Initial Developer. All Rights Reserved.
24 *
25 * Contributor(s):
26 *
27 * Alternatively, the contents of this file may be used under the terms of
28 * either of the GNU General Public License Version 2 or later (the "GPL"),
29 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 * in which case the provisions of the GPL or the LGPL are applicable instead
31 * of those above. If you wish to allow use of your version of this file only
32 * under the terms of either the GPL or the LGPL, and not to allow others to
33 * use your version of this file under the terms of the MPL, indicate your
34 * decision by deleting the provisions above and replace them with the notice
35 * and other provisions required by the GPL or the LGPL. If you do not delete
36 * the provisions above, a recipient may use your version of this file under
37 * the terms of any one of the MPL, the GPL or the LGPL.
38 *
39 * ***** END LICENSE BLOCK ***** */
40
41 /*
42 * JS regular expressions, after Perl.
43 */
44 #include "jsstddef.h"
45 #include <stdlib.h>
46 #include <string.h>
47 #include <stdarg.h>
48 #include "jstypes.h"
49 #include "jsarena.h" /* Added by JSIFY */
50 #include "jsutil.h" /* Added by JSIFY */
51 #include "jsapi.h"
52 #include "jsarray.h"
53 #include "jsatom.h"
54 #include "jsbuiltins.h"
55 #include "jscntxt.h"
56 #include "jsversion.h"
57 #include "jsfun.h"
58 #include "jsgc.h"
59 #include "jsinterp.h"
60 #include "jslock.h"
61 #include "jsnum.h"
62 #include "jsobj.h"
63 #include "jsopcode.h"
64 #include "jsregexp.h"
65 #include "jsscan.h"
66 #include "jsscope.h"
67 #include "jsstr.h"
68
69 #ifdef JS_TRACER
70 #include "jstracer.h"
71 using namespace avmplus;
72 using namespace nanojit;
73
74 /*
75 * FIXME Duplicated with jstracer.cpp, doing it this way for now
76 * to keep it private to files that need it.
77 */
78 #ifdef JS_JIT_SPEW
79 static bool verbose_debug = getenv("TRACEMONKEY") && strstr(getenv("TRACEMONKEY"), "verbose");
80 #define debug_only_v(x) if (verbose_debug) { x; }
81 #else
82 #define debug_only_v(x)
83 #endif
84 #endif
85
86 typedef enum REOp {
87 #define REOP_DEF(opcode, name) opcode,
88 #include "jsreops.tbl"
89 #undef REOP_DEF
90 REOP_LIMIT /* META: no operator >= to this */
91 } REOp;
92
93 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
94
95 #ifdef REGEXP_DEBUG
96 const char *reop_names[] = {
97 #define REOP_DEF(opcode, name) name,
98 #include "jsreops.tbl"
99 #undef REOP_DEF
100 NULL
101 };
102 #endif
103
104 #ifdef __GNUC__
105 static int
106 re_debug(const char *fmt, ...) __attribute__ ((format(printf, 1, 2)));
107 #endif
108
109 #ifdef REGEXP_DEBUG
110 static int
111 re_debug(const char *fmt, ...)
112 {
113 va_list ap;
114 int retval;
115
116 va_start(ap, fmt);
117 retval = vprintf(fmt, ap);
118 va_end(ap);
119 return retval;
120 }
121
122 static void
123 re_debug_chars(const jschar *chrs, size_t length)
124 {
125 int i = 0;
126
127 printf(" \"");
128 while (*chrs && i++ < length) {
129 putchar((char)*chrs++);
130 }
131 printf("\"");
132 }
133 #else /* !REGEXP_DEBUG */
134 /* This should be optimized to a no-op by our tier-1 compilers. */
135 static int
136 re_debug(const char *fmt, ...)
137 {
138 return 0;
139 }
140
141 static void
142 re_debug_chars(const jschar *chrs, size_t length)
143 {
144 }
145 #endif /* !REGEXP_DEBUG */
146
147 struct RENode {
148 REOp op; /* r.e. op bytecode */
149 RENode *next; /* next in concatenation order */
150 void *kid; /* first operand */
151 union {
152 void *kid2; /* second operand */
153 jsint num; /* could be a number */
154 size_t parenIndex; /* or a parenthesis index */
155 struct { /* or a quantifier range */
156 uintN min;
157 uintN max;
158 JSPackedBool greedy;
159 } range;
160 struct { /* or a character class */
161 size_t startIndex;
162 size_t kidlen; /* length of string at kid, in jschars */
163 size_t index; /* index into class list */
164 uint16 bmsize; /* bitmap size, based on max char code */
165 JSPackedBool sense;
166 } ucclass;
167 struct { /* or a literal sequence */
168 jschar chr; /* of one character */
169 size_t length; /* or many (via the kid) */
170 } flat;
171 struct {
172 RENode *kid2; /* second operand from ALT */
173 jschar ch1; /* match char for ALTPREREQ */
174 jschar ch2; /* ditto, or class index for ALTPREREQ2 */
175 } altprereq;
176 } u;
177 };
178
179 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
180 ((c >= 'a') && (c <= 'z')) )
181 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
182 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
183
184 #define CLASS_CACHE_SIZE 4
185
186 typedef struct CompilerState {
187 JSContext *context;
188 JSTokenStream *tokenStream; /* For reporting errors */
189 const jschar *cpbegin;
190 const jschar *cpend;
191 const jschar *cp;
192 size_t parenCount;
193 size_t classCount; /* number of [] encountered */
194 size_t treeDepth; /* maximum depth of parse tree */
195 size_t progLength; /* estimated bytecode length */
196 RENode *result;
197 size_t classBitmapsMem; /* memory to hold all class bitmaps */
198 struct {
199 const jschar *start; /* small cache of class strings */
200 size_t length; /* since they're often the same */
201 size_t index;
202 } classCache[CLASS_CACHE_SIZE];
203 uint16 flags;
204 } CompilerState;
205
206 typedef struct EmitStateStackEntry {
207 jsbytecode *altHead; /* start of REOP_ALT* opcode */
208 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
209 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
210 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
211 RENode *continueNode; /* original REOP_ALT* node being stacked */
212 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
213 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
214 avoid 16-bit unsigned offset overflow */
215 } EmitStateStackEntry;
216
217 /*
218 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
219 * the getters and setters take the pc of the offset, not of the opcode before
220 * the offset.
221 */
222 #define ARG_LEN 2
223 #define GET_ARG(pc) ((uint16)(((pc)[0] << 8) | (pc)[1]))
224 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
225 (pc)[1] = (jsbytecode) (arg))
226
227 #define OFFSET_LEN ARG_LEN
228 #define OFFSET_MAX (JS_BIT(ARG_LEN * 8) - 1)
229 #define GET_OFFSET(pc) GET_ARG(pc)
230
231 /*
232 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
233 * For sanity, we limit it to 2^24 bytes.
234 */
235 #define TREE_DEPTH_MAX (JS_BIT(24) / sizeof(EmitStateStackEntry))
236
237 /*
238 * The maximum memory that can be allocated for class bitmaps.
239 * For sanity, we limit it to 2^24 bytes.
240 */
241 #define CLASS_BITMAPS_MEM_LIMIT JS_BIT(24)
242
243 /*
244 * Functions to get size and write/read bytecode that represent small indexes
245 * compactly.
246 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
247 * indicates that the following byte brings more bits to the index. Otherwise
248 * this is the last byte in the index bytecode representing highest index bits.
249 */
250 static size_t
251 GetCompactIndexWidth(size_t index)
252 {
253 size_t width;
254
255 for (width = 1; (index >>= 7) != 0; ++width) { }
256 return width;
257 }
258
259 static JS_ALWAYS_INLINE jsbytecode *
260 WriteCompactIndex(jsbytecode *pc, size_t index)
261 {
262 size_t next;
263
264 while ((next = index >> 7) != 0) {
265 *pc++ = (jsbytecode)(index | 0x80);
266 index = next;
267 }
268 *pc++ = (jsbytecode)index;
269 return pc;
270 }
271
272 static JS_ALWAYS_INLINE jsbytecode *
273 ReadCompactIndex(jsbytecode *pc, size_t *result)
274 {
275 size_t nextByte;
276
277 nextByte = *pc++;
278 if ((nextByte & 0x80) == 0) {
279 /*
280 * Short-circuit the most common case when compact index <= 127.
281 */
282 *result = nextByte;
283 } else {
284 size_t shift = 7;
285 *result = 0x7F & nextByte;
286 do {
287 nextByte = *pc++;
288 *result |= (nextByte & 0x7F) << shift;
289 shift += 7;
290 } while ((nextByte & 0x80) != 0);
291 }
292 return pc;
293 }
294
295 typedef struct RECapture {
296 ptrdiff_t index; /* start of contents, -1 for empty */
297 size_t length; /* length of capture */
298 } RECapture;
299
300 typedef struct REMatchState {
301 const jschar *cp;
302 RECapture parens[1]; /* first of 're->parenCount' captures,
303 allocated at end of this struct */
304 } REMatchState;
305
306 struct REBackTrackData;
307
308 typedef struct REProgState {
309 jsbytecode *continue_pc; /* current continuation data */
310 jsbytecode continue_op;
311 ptrdiff_t index; /* progress in text */
312 size_t parenSoFar; /* highest indexed paren started */
313 union {
314 struct {
315 uintN min; /* current quantifier limits */
316 uintN max;
317 } quantifier;
318 struct {
319 size_t top; /* backtrack stack state */
320 size_t sz;
321 } assertion;
322 } u;
323 } REProgState;
324
325 typedef struct REBackTrackData {
326 size_t sz; /* size of previous stack entry */
327 jsbytecode *backtrack_pc; /* where to backtrack to */
328 jsbytecode backtrack_op;
329 const jschar *cp; /* index in text of match at backtrack */
330 size_t parenIndex; /* start index of saved paren contents */
331 size_t parenCount; /* # of saved paren contents */
332 size_t saveStateStackTop; /* number of parent states */
333 /* saved parent states follow */
334 /* saved paren contents follow */
335 } REBackTrackData;
336
337 #define INITIAL_STATESTACK 100
338 #define INITIAL_BACKTRACK 8000
339
340 typedef struct REGlobalData {
341 JSContext *cx;
342 JSRegExp *regexp; /* the RE in execution */
343 JSBool ok; /* runtime error (out_of_memory only?) */
344 size_t start; /* offset to start at */
345 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
346 const jschar *cpbegin; /* text base address */
347 const jschar *cpend; /* text limit address */
348
349 REProgState *stateStack; /* stack of state of current parents */
350 size_t stateStackTop;
351 size_t stateStackLimit;
352
353 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
354 REBackTrackData *backTrackSP;
355 size_t backTrackStackSize;
356 size_t cursz; /* size of current stack entry */
357 size_t backTrackCount; /* how many times we've backtracked */
358 size_t backTrackLimit; /* upper limit on backtrack states */
359 } REGlobalData;
360
361 /*
362 * 1. If IgnoreCase is false, return ch.
363 * 2. Let u be ch converted to upper case as if by calling
364 * String.prototype.toUpperCase on the one-character string ch.
365 * 3. If u does not consist of a single character, return ch.
366 * 4. Let cu be u's character.
367 * 5. If ch's code point value is greater than or equal to decimal 128 and cu's
368 * code point value is less than decimal 128, then return ch.
369 * 6. Return cu.
370 */
371 static JS_ALWAYS_INLINE uintN
372 upcase(uintN ch)
373 {
374 uintN cu;
375
376 JS_ASSERT((uintN) (jschar) ch == ch);
377 if (ch < 128) {
378 if (ch - (uintN) 'a' <= (uintN) ('z' - 'a'))
379 ch -= (uintN) ('a' - 'A');
380 return ch;
381 }
382
383 cu = JS_TOUPPER(ch);
384 return (cu < 128) ? ch : cu;
385 }
386
387 static JS_ALWAYS_INLINE uintN
388 downcase(uintN ch)
389 {
390 JS_ASSERT((uintN) (jschar) ch == ch);
391 if (ch < 128) {
392 if (ch - (uintN) 'A' <= (uintN) ('Z' - 'A'))
393 ch += (uintN) ('a' - 'A');
394 return ch;
395 }
396
397 return JS_TOLOWER(ch);
398 }
399
400 /* Construct and initialize an RENode, returning NULL for out-of-memory */
401 static RENode *
402 NewRENode(CompilerState *state, REOp op)
403 {
404 JSContext *cx;
405 RENode *ren;
406
407 cx = state->context;
408 JS_ARENA_ALLOCATE_CAST(ren, RENode *, &cx->tempPool, sizeof *ren);
409 if (!ren) {
410 js_ReportOutOfScriptQuota(cx);
411 return NULL;
412 }
413 ren->op = op;
414 ren->next = NULL;
415 ren->kid = NULL;
416 return ren;
417 }
418
419 /*
420 * Validates and converts hex ascii value.
421 */
422 static JSBool
423 isASCIIHexDigit(jschar c, uintN *digit)
424 {
425 uintN cv = c;
426
427 if (cv < '0')
428 return JS_FALSE;
429 if (cv <= '9') {
430 *digit = cv - '0';
431 return JS_TRUE;
432 }
433 cv |= 0x20;
434 if (cv >= 'a' && cv <= 'f') {
435 *digit = cv - 'a' + 10;
436 return JS_TRUE;
437 }
438 return JS_FALSE;
439 }
440
441
442 typedef struct {
443 REOp op;
444 const jschar *errPos;
445 size_t parenIndex;
446 } REOpData;
447
448 static JSBool
449 ReportRegExpErrorHelper(CompilerState *state, uintN flags, uintN errorNumber,
450 const jschar *arg)
451 {
452 if (state->tokenStream) {
453 return js_ReportCompileErrorNumber(state->context, state->tokenStream,
454 NULL, JSREPORT_UC | flags,
455 errorNumber, arg);
456 }
457 return JS_ReportErrorFlagsAndNumberUC(state->context, flags,
458 js_GetErrorMessage, NULL,
459 errorNumber, arg);
460 }
461
462 static JSBool
463 ReportRegExpError(CompilerState *state, uintN flags, uintN errorNumber)
464 {
465 return ReportRegExpErrorHelper(state, flags, errorNumber, NULL);
466 }
467
468 /*
469 * Process the op against the two top operands, reducing them to a single
470 * operand in the penultimate slot. Update progLength and treeDepth.
471 */
472 static JSBool
473 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
474 intN operandSP)
475 {
476 RENode *result;
477
478 switch (opData->op) {
479 case REOP_ALT:
480 result = NewRENode(state, REOP_ALT);
481 if (!result)
482 return JS_FALSE;
483 result->kid = operandStack[operandSP - 2];
484 result->u.kid2 = operandStack[operandSP - 1];
485 operandStack[operandSP - 2] = result;
486
487 if (state->treeDepth == TREE_DEPTH_MAX) {
488 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
489 return JS_FALSE;
490 }
491 ++state->treeDepth;
492
493 /*
494 * Look at both alternates to see if there's a FLAT or a CLASS at
495 * the start of each. If so, use a prerequisite match.
496 */
497 if (((RENode *) result->kid)->op == REOP_FLAT &&
498 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
499 (state->flags & JSREG_FOLD) == 0) {
500 result->op = REOP_ALTPREREQ;
501 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
502 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
503 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
504 JUMP, <end> ... ENDALT */
505 state->progLength += 13;
506 }
507 else
508 if (((RENode *) result->kid)->op == REOP_CLASS &&
509 ((RENode *) result->kid)->u.ucclass.index < 256 &&
510 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
511 (state->flags & JSREG_FOLD) == 0) {
512 result->op = REOP_ALTPREREQ2;
513 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
514 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
515 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
516 JUMP, <end> ... ENDALT */
517 state->progLength += 13;
518 }
519 else
520 if (((RENode *) result->kid)->op == REOP_FLAT &&
521 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
522 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
523 (state->flags & JSREG_FOLD) == 0) {
524 result->op = REOP_ALTPREREQ2;
525 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
526 result->u.altprereq.ch2 =
527 ((RENode *) result->u.kid2)->u.ucclass.index;
528 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
529 JUMP, <end> ... ENDALT */
530 state->progLength += 13;
531 }
532 else {
533 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
534 state->progLength += 7;
535 }
536 break;
537
538 case REOP_CONCAT:
539 result = operandStack[operandSP - 2];
540 while (result->next)
541 result = result->next;
542 result->next = operandStack[operandSP - 1];
543 break;
544
545 case REOP_ASSERT:
546 case REOP_ASSERT_NOT:
547 case REOP_LPARENNON:
548 case REOP_LPAREN:
549 /* These should have been processed by a close paren. */
550 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
551 opData->errPos);
552 return JS_FALSE;
553
554 default:;
555 }
556 return JS_TRUE;
557 }
558
559 /*
560 * Parser forward declarations.
561 */
562 static JSBool ParseTerm(CompilerState *state);
563 static JSBool ParseQuantifier(CompilerState *state);
564 static intN ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues);
565
566 /*
567 * Top-down regular expression grammar, based closely on Perl4.
568 *
569 * regexp: altern A regular expression is one or more
570 * altern '|' regexp alternatives separated by vertical bar.
571 */
572 #define INITIAL_STACK_SIZE 128
573
574 static JSBool
575 ParseRegExp(CompilerState *state)
576 {
577 size_t parenIndex;
578 RENode *operand;
579 REOpData *operatorStack;
580 RENode **operandStack;
581 REOp op;
582 intN i;
583 JSBool result = JS_FALSE;
584
585 intN operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
586 intN operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
587
588 /* Watch out for empty regexp */
589 if (state->cp == state->cpend) {
590 state->result = NewRENode(state, REOP_EMPTY);
591 return (state->result != NULL);
592 }
593
594 operatorStack = (REOpData *)
595 JS_malloc(state->context, sizeof(REOpData) * operatorStackSize);
596 if (!operatorStack)
597 return JS_FALSE;
598
599 operandStack = (RENode **)
600 JS_malloc(state->context, sizeof(RENode *) * operandStackSize);
601 if (!operandStack)
602 goto out;
603
604 for (;;) {
605 parenIndex = state->parenCount;
606 if (state->cp == state->cpend) {
607 /*
608 * If we are at the end of the regexp and we're short one or more
609 * operands, the regexp must have the form /x|/ or some such, with
610 * left parentheses making us short more than one operand.
611 */
612 if (operatorSP >= operandSP) {
613 operand = NewRENode(state, REOP_EMPTY);
614 if (!operand)
615 goto out;
616 goto pushOperand;
617 }
618 } else {
619 switch (*state->cp) {
620 case '(':
621 ++state->cp;
622 if (state->cp + 1 < state->cpend &&
623 *state->cp == '?' &&
624 (state->cp[1] == '=' ||
625 state->cp[1] == '!' ||
626 state->cp[1] == ':')) {
627 switch (state->cp[1]) {
628 case '=':
629 op = REOP_ASSERT;
630 /* ASSERT, <next>, ... ASSERTTEST */
631 state->progLength += 4;
632 break;
633 case '!':
634 op = REOP_ASSERT_NOT;
635 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
636 state->progLength += 4;
637 break;
638 default:
639 op = REOP_LPARENNON;
640 break;
641 }
642 state->cp += 2;
643 } else {
644 op = REOP_LPAREN;
645 /* LPAREN, <index>, ... RPAREN, <index> */
646 state->progLength
647 += 2 * (1 + GetCompactIndexWidth(parenIndex));
648 state->parenCount++;
649 if (state->parenCount == 65535) {
650 ReportRegExpError(state, JSREPORT_ERROR,
651 JSMSG_TOO_MANY_PARENS);
652 goto out;
653 }
654 }
655 goto pushOperator;
656
657 case ')':
658 /*
659 * If there's no stacked open parenthesis, throw syntax error.
660 */
661 for (i = operatorSP - 1; ; i--) {
662 if (i < 0) {
663 ReportRegExpError(state, JSREPORT_ERROR,
664 JSMSG_UNMATCHED_RIGHT_PAREN);
665 goto out;
666 }
667 if (operatorStack[i].op == REOP_ASSERT ||
668 operatorStack[i].op == REOP_ASSERT_NOT ||
669 operatorStack[i].op == REOP_LPARENNON ||
670 operatorStack[i].op == REOP_LPAREN) {
671 break;
672 }
673 }
674 /* FALL THROUGH */
675
676 case '|':
677 /* Expected an operand before these, so make an empty one */
678 operand = NewRENode(state, REOP_EMPTY);
679 if (!operand)
680 goto out;
681 goto pushOperand;
682
683 default:
684 if (!ParseTerm(state))
685 goto out;
686 operand = state->result;
687 pushOperand:
688 if (operandSP == operandStackSize) {
689 RENode **tmp;
690 operandStackSize += operandStackSize;
691 tmp = (RENode **)
692 JS_realloc(state->context, operandStack,
693 sizeof(RENode *) * operandStackSize);
694 if (!tmp)
695 goto out;
696 operandStack = tmp;
697 }
698 operandStack[operandSP++] = operand;
699 break;
700 }
701 }
702
703 /* At the end; process remaining operators. */
704 restartOperator:
705 if (state->cp == state->cpend) {
706 while (operatorSP) {
707 --operatorSP;
708 if (!ProcessOp(state, &operatorStack[operatorSP],
709 operandStack, operandSP))
710 goto out;
711 --operandSP;
712 }
713 JS_ASSERT(operandSP == 1);
714 state->result = operandStack[0];
715 result = JS_TRUE;
716 goto out;
717 }
718
719 switch (*state->cp) {
720 case '|':
721 /* Process any stacked 'concat' operators */
722 ++state->cp;
723 while (operatorSP &&
724 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
725 --operatorSP;
726 if (!ProcessOp(state, &operatorStack[operatorSP],
727 operandStack, operandSP)) {
728 goto out;
729 }
730 --operandSP;
731 }
732 op = REOP_ALT;
733 goto pushOperator;
734
735 case ')':
736 /*
737 * If there's no stacked open parenthesis, throw syntax error.
738 */
739 for (i = operatorSP - 1; ; i--) {
740 if (i < 0) {
741 ReportRegExpError(state, JSREPORT_ERROR,
742 JSMSG_UNMATCHED_RIGHT_PAREN);
743 goto out;
744 }
745 if (operatorStack[i].op == REOP_ASSERT ||
746 operatorStack[i].op == REOP_ASSERT_NOT ||
747 operatorStack[i].op == REOP_LPARENNON ||
748 operatorStack[i].op == REOP_LPAREN) {
749 break;
750 }
751 }
752 ++state->cp;
753
754 /* Process everything on the stack until the open parenthesis. */
755 for (;;) {
756 JS_ASSERT(operatorSP);
757 --operatorSP;
758 switch (operatorStack[operatorSP].op) {
759 case REOP_ASSERT:
760 case REOP_ASSERT_NOT:
761 case REOP_LPAREN:
762 operand = NewRENode(state, operatorStack[operatorSP].op);
763 if (!operand)
764 goto out;
765 operand->u.parenIndex =
766 operatorStack[operatorSP].parenIndex;
767 JS_ASSERT(operandSP);
768 operand->kid = operandStack[operandSP - 1];
769 operandStack[operandSP - 1] = operand;
770 if (state->treeDepth == TREE_DEPTH_MAX) {
771 ReportRegExpError(state, JSREPORT_ERROR,
772 JSMSG_REGEXP_TOO_COMPLEX);
773 goto out;
774 }
775 ++state->treeDepth;
776 /* FALL THROUGH */
777
778 case REOP_LPARENNON:
779 state->result = operandStack[operandSP - 1];
780 if (!ParseQuantifier(state))
781 goto out;
782 operandStack[operandSP - 1] = state->result;
783 goto restartOperator;
784 default:
785 if (!ProcessOp(state, &operatorStack[operatorSP],
786 operandStack, operandSP))
787 goto out;
788 --operandSP;
789 break;
790 }
791 }
792 break;
793
794 case '{':
795 {
796 const jschar *errp = state->cp;
797
798 if (ParseMinMaxQuantifier(state, JS_TRUE) < 0) {
799 /*
800 * This didn't even scan correctly as a quantifier, so we should
801 * treat it as flat.
802 */
803 op = REOP_CONCAT;
804 goto pushOperator;
805 }
806
807 state->cp = errp;
808 /* FALL THROUGH */
809 }
810
811 case '+':
812 case '*':
813 case '?':
814 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
815 state->cp);
816 result = JS_FALSE;
817 goto out;
818
819 default:
820 /* Anything else is the start of the next term. */
821 op = REOP_CONCAT;
822 pushOperator:
823 if (operatorSP == operatorStackSize) {
824 REOpData *tmp;
825 operatorStackSize += operatorStackSize;
826 tmp = (REOpData *)
827 JS_realloc(state->context, operatorStack,
828 sizeof(REOpData) * operatorStackSize);
829 if (!tmp)
830 goto out;
831 operatorStack = tmp;
832 }
833 operatorStack[operatorSP].op = op;
834 operatorStack[operatorSP].errPos = state->cp;
835 operatorStack[operatorSP++].parenIndex = parenIndex;
836 break;
837 }
838 }
839 out:
840 if (operatorStack)
841 JS_free(state->context, operatorStack);
842 if (operandStack)
843 JS_free(state->context, operandStack);
844 return result;
845 }
846
847 /*
848 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
849 * its being on the stack, and to propagate errors to its callers.
850 */
851 #define JSREG_FIND_PAREN_COUNT 0x8000
852 #define JSREG_FIND_PAREN_ERROR 0x4000
853
854 /*
855 * Magic return value from FindParenCount and GetDecimalValue, to indicate
856 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
857 * its findMax parameter is non-null.
858 */
859 #define OVERFLOW_VALUE ((uintN)-1)
860
861 static uintN
862 FindParenCount(CompilerState *state)
863 {
864 CompilerState temp;
865 int i;
866
867 if (state->flags & JSREG_FIND_PAREN_COUNT)
868 return OVERFLOW_VALUE;
869
870 /*
871 * Copy state into temp, flag it so we never report an invalid backref,
872 * and reset its members to parse the entire regexp. This is obviously
873 * suboptimal, but GetDecimalValue calls us only if a backref appears to
874 * refer to a forward parenthetical, which is rare.
875 */
876 temp = *state;
877 temp.flags |= JSREG_FIND_PAREN_COUNT;
878 temp.cp = temp.cpbegin;
879 temp.parenCount = 0;
880 temp.classCount = 0;
881 temp.progLength = 0;
882 temp.treeDepth = 0;
883 temp.classBitmapsMem = 0;
884 for (i = 0; i < CLASS_CACHE_SIZE; i++)
885 temp.classCache[i].start = NULL;
886
887 if (!ParseRegExp(&temp)) {
888 state->flags |= JSREG_FIND_PAREN_ERROR;
889 return OVERFLOW_VALUE;
890 }
891 return temp.parenCount;
892 }
893
894 /*
895 * Extract and return a decimal value at state->cp. The initial character c
896 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
897 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
898 * state->flags to discover whether an error occurred under findMax.
899 */
900 static uintN
901 GetDecimalValue(jschar c, uintN max, uintN (*findMax)(CompilerState *state),
902 CompilerState *state)
903 {
904 uintN value = JS7_UNDEC(c);
905 JSBool overflow = (value > max && (!findMax || value > findMax(state)));
906
907 /* The following restriction allows simpler overflow checks. */
908 JS_ASSERT(max <= ((uintN)-1 - 9) / 10);
909 while (state->cp < state->cpend) {
910 c = *state->cp;
911 if (!JS7_ISDEC(c))
912 break;
913 value = 10 * value + JS7_UNDEC(c);
914 if (!overflow && value > max && (!findMax || value > findMax(state)))
915 overflow = JS_TRUE;
916 ++state->cp;
917 }
918 return overflow ? OVERFLOW_VALUE : value;
919 }
920
921 /*
922 * Calculate the total size of the bitmap required for a class expression.
923 */
924 static JSBool
925 CalculateBitmapSize(CompilerState *state, RENode *target, const jschar *src,
926 const jschar *end)
927 {
928 uintN max = 0;
929 JSBool inRange = JS_FALSE;
930 jschar c, rangeStart = 0;
931 uintN n, digit, nDigits, i;
932
933 target->u.ucclass.bmsize = 0;
934 target->u.ucclass.sense = JS_TRUE;
935
936 if (src == end)
937 return JS_TRUE;
938
939 if (*src == '^') {
940 ++src;
941 target->u.ucclass.sense = JS_FALSE;
942 }
943
944 while (src != end) {
945 JSBool canStartRange = JS_TRUE;
946 uintN localMax = 0;
947
948 switch (*src) {
949 case '\\':
950 ++src;
951 c = *src++;
952 switch (c) {
953 case 'b':
954 localMax = 0x8;
955 break;
956 case 'f':
957 localMax = 0xC;
958 break;
959 case 'n':
960 localMax = 0xA;
961 break;
962 case 'r':
963 localMax = 0xD;
964 break;
965 case 't':
966 localMax = 0x9;
967 break;
968 case 'v':
969 localMax = 0xB;
970 break;
971 case 'c':
972 if (src < end && RE_IS_LETTER(*src)) {
973 localMax = (uintN) (*src++) & 0x1F;
974 } else {
975 --src;
976 localMax = '\\';
977 }
978 break;
979 case 'x':
980 nDigits = 2;
981 goto lexHex;
982 case 'u':
983 nDigits = 4;
984 lexHex:
985 n = 0;
986 for (i = 0; (i < nDigits) && (src < end); i++) {
987 c = *src++;
988 if (!isASCIIHexDigit(c, &digit)) {
989 /*
990 * Back off to accepting the original
991 *'\' as a literal.
992 */
993 src -= i + 1;
994 n = '\\';
995 break;
996 }
997 n = (n << 4) | digit;
998 }
999 localMax = n;
1000 break;
1001 case 'd':
1002 canStartRange = JS_FALSE;
1003 if (inRange) {
1004 JS_ReportErrorNumber(state->context,
1005 js_GetErrorMessage, NULL,
1006 JSMSG_BAD_CLASS_RANGE);
1007 return JS_FALSE;
1008 }
1009 localMax = '9';
1010 break;
1011 case 'D':
1012 case 's':
1013 case 'S':
1014 case 'w':
1015 case 'W':
1016 canStartRange = JS_FALSE;
1017 if (inRange) {
1018 JS_ReportErrorNumber(state->context,
1019 js_GetErrorMessage, NULL,
1020 JSMSG_BAD_CLASS_RANGE);
1021 return JS_FALSE;
1022 }
1023 max = 65535;
1024
1025 /*
1026 * If this is the start of a range, ensure that it's less than
1027 * the end.
1028 */
1029 localMax = 0;
1030 break;
1031 case '0':
1032 case '1':
1033 case '2':
1034 case '3':
1035 case '4':
1036 case '5':
1037 case '6':
1038 case '7':
1039 /*
1040 * This is a non-ECMA extension - decimal escapes (in this
1041 * case, octal!) are supposed to be an error inside class
1042 * ranges, but supported here for backwards compatibility.
1043 *
1044 */
1045 n = JS7_UNDEC(c);
1046 c = *src;
1047 if ('0' <= c && c <= '7') {
1048 src++;
1049 n = 8 * n + JS7_UNDEC(c);
1050 c = *src;
1051 if ('0' <= c && c <= '7') {
1052 src++;
1053 i = 8 * n + JS7_UNDEC(c);
1054 if (i <= 0377)
1055 n = i;
1056 else
1057 src--;
1058 }
1059 }
1060 localMax = n;
1061 break;
1062
1063 default:
1064 localMax = c;
1065 break;
1066 }
1067 break;
1068 default:
1069 localMax = *src++;
1070 break;
1071 }
1072
1073 if (inRange) {
1074 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1075 if (rangeStart > localMax) {
1076 JS_ReportErrorNumber(state->context,
1077 js_GetErrorMessage, NULL,
1078 JSMSG_BAD_CLASS_RANGE);
1079 return JS_FALSE;
1080 }
1081 inRange = JS_FALSE;
1082 } else {
1083 if (canStartRange && src < end - 1) {
1084 if (*src == '-') {
1085 ++src;
1086 inRange = JS_TRUE;
1087 rangeStart = (jschar)localMax;
1088 continue;
1089 }
1090 }
1091 if (state->flags & JSREG_FOLD)
1092 rangeStart = localMax; /* one run of the uc/dc loop below */
1093 }
1094
1095 if (state->flags & JSREG_FOLD) {
1096 jschar maxch = localMax;
1097
1098 for (i = rangeStart; i <= localMax; i++) {
1099 jschar uch, dch;
1100
1101 uch = upcase(i);
1102 dch = downcase(i);
1103 maxch = JS_MAX(maxch, uch);
1104 maxch = JS_MAX(maxch, dch);
1105 }
1106 localMax = maxch;
1107 }
1108
1109 if (localMax > max)
1110 max = localMax;
1111 }
1112 target->u.ucclass.bmsize = max;
1113 return JS_TRUE;
1114 }
1115
1116 /*
1117 * item: assertion An item is either an assertion or
1118 * quantatom a quantified atom.
1119 *
1120 * assertion: '^' Assertions match beginning of string
1121 * (or line if the class static property
1122 * RegExp.multiline is true).
1123 * '$' End of string (or line if the class
1124 * static property RegExp.multiline is
1125 * true).
1126 * '\b' Word boundary (between \w and \W).
1127 * '\B' Word non-boundary.
1128 *
1129 * quantatom: atom An unquantified atom.
1130 * quantatom '{' n ',' m '}'
1131 * Atom must occur between n and m times.
1132 * quantatom '{' n ',' '}' Atom must occur at least n times.
1133 * quantatom '{' n '}' Atom must occur exactly n times.
1134 * quantatom '*' Zero or more times (same as {0,}).
1135 * quantatom '+' One or more times (same as {1,}).
1136 * quantatom '?' Zero or one time (same as {0,1}).
1137 *
1138 * any of which can be optionally followed by '?' for ungreedy
1139 *
1140 * atom: '(' regexp ')' A parenthesized regexp (what matched
1141 * can be addressed using a backreference,
1142 * see '\' n below).
1143 * '.' Matches any char except '\n'.
1144 * '[' classlist ']' A character class.
1145 * '[' '^' classlist ']' A negated character class.
1146 * '\f' Form Feed.
1147 * '\n' Newline (Line Feed).
1148 * '\r' Carriage Return.
1149 * '\t' Horizontal Tab.
1150 * '\v' Vertical Tab.
1151 * '\d' A digit (same as [0-9]).
1152 * '\D' A non-digit.
1153 * '\w' A word character, [0-9a-z_A-Z].
1154 * '\W' A non-word character.
1155 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1156 * '\S' A non-whitespace character.
1157 * '\' n A backreference to the nth (n decimal
1158 * and positive) parenthesized expression.
1159 * '\' octal An octal escape sequence (octal must be
1160 * two or three digits long, unless it is
1161 * 0 for the null character).
1162 * '\x' hex A hex escape (hex must be two digits).
1163 * '\u' unicode A unicode escape (must be four digits).
1164 * '\c' ctrl A control character, ctrl is a letter.
1165 * '\' literalatomchar Any character except one of the above
1166 * that follow '\' in an atom.
1167 * otheratomchar Any character not first among the other
1168 * atom right-hand sides.
1169 */
1170 static JSBool
1171 ParseTerm(CompilerState *state)
1172 {
1173 jschar c = *state->cp++;
1174 uintN nDigits;
1175 uintN num, tmp, n, i;
1176 const jschar *termStart;
1177
1178 switch (c) {
1179 /* assertions and atoms */
1180 case '^':
1181 state->result = NewRENode(state, REOP_BOL);
1182 if (!state->result)
1183 return JS_FALSE;
1184 state->progLength++;
1185 return JS_TRUE;
1186 case '$':
1187 state->result = NewRENode(state, REOP_EOL);
1188 if (!state->result)
1189 return JS_FALSE;
1190 state->progLength++;
1191 return JS_TRUE;
1192 case '\\':
1193 if (state->cp >= state->cpend) {
1194 /* a trailing '\' is an error */
1195 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1196 return JS_FALSE;
1197 }
1198 c = *state->cp++;
1199 switch (c) {
1200 /* assertion escapes */
1201 case 'b' :
1202 state->result = NewRENode(state, REOP_WBDRY);
1203 if (!state->result)
1204 return JS_FALSE;
1205 state->progLength++;
1206 return JS_TRUE;
1207 case 'B':
1208 state->result = NewRENode(state, REOP_WNONBDRY);
1209 if (!state->result)
1210 return JS_FALSE;
1211 state->progLength++;
1212 return JS_TRUE;
1213 /* Decimal escape */
1214 case '0':
1215 /* Give a strict warning. See also the note below. */
1216 if (!ReportRegExpError(state, JSREPORT_WARNING | JSREPORT_STRICT,
1217 JSMSG_INVALID_BACKREF)) {
1218 return JS_FALSE;
1219 }
1220 doOctal:
1221 num = 0;
1222 while (state->cp < state->cpend) {
1223 c = *state->cp;
1224 if (c < '0' || '7' < c)
1225 break;
1226 state->cp++;
1227 tmp = 8 * num + (uintN)JS7_UNDEC(c);
1228 if (tmp > 0377)
1229 break;
1230 num = tmp;
1231 }
1232 c = (jschar)num;
1233 doFlat:
1234 state->result = NewRENode(state, REOP_FLAT);
1235 if (!state->result)
1236 return JS_FALSE;
1237 state->result->u.flat.chr = c;
1238 state->result->u.flat.length = 1;
1239 state->progLength += 3;
1240 break;
1241 case '1':
1242 case '2':
1243 case '3':
1244 case '4':
1245 case '5':
1246 case '6':
1247 case '7':
1248 case '8':
1249 case '9':
1250 termStart = state->cp - 1;
1251 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1252 if (state->flags & JSREG_FIND_PAREN_ERROR)
1253 return JS_FALSE;
1254 if (num == OVERFLOW_VALUE) {
1255 /* Give a strict mode warning. */
1256 if (!ReportRegExpError(state,
1257 JSREPORT_WARNING | JSREPORT_STRICT,
1258 (c >= '8')
1259 ? JSMSG_INVALID_BACKREF
1260 : JSMSG_BAD_BACKREF)) {
1261 return JS_FALSE;
1262 }
1263
1264 /*
1265 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1266 * error here. However, for compatibility with IE, we treat the
1267 * whole backref as flat if the first character in it is not a
1268 * valid octal character, and as an octal escape otherwise.
1269 */
1270 state->cp = termStart;
1271 if (c >= '8') {
1272 /* Treat this as flat. termStart - 1 is the \. */
1273 c = '\\';
1274 goto asFlat;
1275 }
1276
1277 /* Treat this as an octal escape. */
1278 goto doOctal;
1279 }
1280 JS_ASSERT(1 <= num && num <= 0x10000);
1281 state->result = NewRENode(state, REOP_BACKREF);
1282 if (!state->result)
1283 return JS_FALSE;
1284 state->result->u.parenIndex = num - 1;
1285 state->progLength
1286 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1287 break;
1288 /* Control escape */
1289 case 'f':
1290 c = 0xC;
1291 goto doFlat;
1292 case 'n':
1293 c = 0xA;
1294 goto doFlat;
1295 case 'r':
1296 c = 0xD;
1297 goto doFlat;
1298 case 't':
1299 c = 0x9;
1300 goto doFlat;
1301 case 'v':
1302 c = 0xB;
1303 goto doFlat;
1304 /* Control letter */
1305 case 'c':
1306 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1307 c = (jschar) (*state->cp++ & 0x1F);
1308 } else {
1309 /* back off to accepting the original '\' as a literal */
1310 --state->cp;
1311 c = '\\';
1312 }
1313 goto doFlat;
1314 /* HexEscapeSequence */
1315 case 'x':
1316 nDigits = 2;
1317 goto lexHex;
1318 /* UnicodeEscapeSequence */
1319 case 'u':
1320 nDigits = 4;
1321 lexHex:
1322 n = 0;
1323 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1324 uintN digit;
1325 c = *state->cp++;
1326 if (!isASCIIHexDigit(c, &digit)) {
1327 /*
1328 * Back off to accepting the original 'u' or 'x' as a
1329 * literal.
1330 */
1331 state->cp -= i + 2;
1332 n = *state->cp++;
1333 break;
1334 }
1335 n = (n << 4) | digit;
1336 }
1337 c = (jschar) n;
1338 goto doFlat;
1339 /* Character class escapes */
1340 case 'd':
1341 state->result = NewRENode(state, REOP_DIGIT);
1342 doSimple:
1343 if (!state->result)
1344 return JS_FALSE;
1345 state->progLength++;
1346 break;
1347 case 'D':
1348 state->result = NewRENode(state, REOP_NONDIGIT);
1349 goto doSimple;
1350 case 's':
1351 state->result = NewRENode(state, REOP_SPACE);
1352 goto doSimple;
1353 case 'S':
1354 state->result = NewRENode(state, REOP_NONSPACE);
1355 goto doSimple;
1356 case 'w':
1357 state->result = NewRENode(state, REOP_ALNUM);
1358 goto doSimple;
1359 case 'W':
1360 state->result = NewRENode(state, REOP_NONALNUM);
1361 goto doSimple;
1362 /* IdentityEscape */
1363 default:
1364 state->result = NewRENode(state, REOP_FLAT);
1365 if (!state->result)
1366 return JS_FALSE;
1367 state->result->u.flat.chr = c;
1368 state->result->u.flat.length = 1;
1369 state->result->kid = (void *) (state->cp - 1);
1370 state->progLength += 3;
1371 break;
1372 }
1373 break;
1374 case '[':
1375 state->result = NewRENode(state, REOP_CLASS);
1376 if (!state->result)
1377 return JS_FALSE;
1378 termStart = state->cp;
1379 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1380 for (;;) {
1381 if (state->cp == state->cpend) {
1382 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1383 JSMSG_UNTERM_CLASS, termStart);
1384
1385 return JS_FALSE;
1386 }
1387 if (*state->cp == '\\') {
1388 state->cp++;
1389 if (state->cp != state->cpend)
1390 state->cp++;
1391 continue;
1392 }
1393 if (*state->cp == ']') {
1394 state->result->u.ucclass.kidlen = state->cp - termStart;
1395 break;
1396 }
1397 state->cp++;
1398 }
1399 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1400 if (!state->classCache[i].start) {
1401 state->classCache[i].start = termStart;
1402 state->classCache[i].length = state->result->u.ucclass.kidlen;
1403 state->classCache[i].index = state->classCount;
1404 break;
1405 }
1406 if (state->classCache[i].length ==
1407 state->result->u.ucclass.kidlen) {
1408 for (n = 0; ; n++) {
1409 if (n == state->classCache[i].length) {
1410 state->result->u.ucclass.index
1411 = state->classCache[i].index;
1412 goto claim;
1413 }
1414 if (state->classCache[i].start[n] != termStart[n])
1415 break;
1416 }
1417 }
1418 }
1419 state->result->u.ucclass.index = state->classCount++;
1420
1421 claim:
1422 /*
1423 * Call CalculateBitmapSize now as we want any errors it finds
1424 * to be reported during the parse phase, not at execution.
1425 */
1426 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1427 return JS_FALSE;
1428 /*
1429 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1430 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1431 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1432 */
1433 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1434 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1435 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1436 return JS_FALSE;
1437 }
1438 state->classBitmapsMem += n;
1439 /* CLASS, <index> */
1440 state->progLength
1441 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1442 break;
1443
1444 case '.':
1445 state->result = NewRENode(state, REOP_DOT);
1446 goto doSimple;
1447
1448 case '{':
1449 {
1450 const jschar *errp = state->cp--;
1451 intN err;
1452
1453 err = ParseMinMaxQuantifier(state, JS_TRUE);
1454 state->cp = errp;
1455
1456 if (err < 0)
1457 goto asFlat;
1458
1459 /* FALL THROUGH */
1460 }
1461 case '*':
1462 case '+':
1463 case '?':
1464 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1465 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1466 return JS_FALSE;
1467 default:
1468 asFlat:
1469 state->result = NewRENode(state, REOP_FLAT);
1470 if (!state->result)
1471 return JS_FALSE;
1472 state->result->u.flat.chr = c;
1473 state->result->u.flat.length = 1;
1474 state->result->kid = (void *) (state->cp - 1);
1475 state->progLength += 3;
1476 break;
1477 }
1478 return ParseQuantifier(state);
1479 }
1480
1481 static JSBool
1482 ParseQuantifier(CompilerState *state)
1483 {
1484 RENode *term;
1485 term = state->result;
1486 if (state->cp < state->cpend) {
1487 switch (*state->cp) {
1488 case '+':
1489 state->result = NewRENode(state, REOP_QUANT);
1490 if (!state->result)
1491 return JS_FALSE;
1492 state->result->u.range.min = 1;
1493 state->result->u.range.max = (uintN)-1;
1494 /* <PLUS>, <next> ... <ENDCHILD> */
1495 state->progLength += 4;
1496 goto quantifier;
1497 case '*':
1498 state->result = NewRENode(state, REOP_QUANT);
1499 if (!state->result)
1500 return JS_FALSE;
1501 state->result->u.range.min = 0;
1502 state->result->u.range.max = (uintN)-1;
1503 /* <STAR>, <next> ... <ENDCHILD> */
1504 state->progLength += 4;
1505 goto quantifier;
1506 case '?':
1507 state->result = NewRENode(state, REOP_QUANT);
1508 if (!state->result)
1509 return JS_FALSE;
1510 state->result->u.range.min = 0;
1511 state->result->u.range.max = 1;
1512 /* <OPT>, <next> ... <ENDCHILD> */
1513 state->progLength += 4;
1514 goto quantifier;
1515 case '{': /* balance '}' */
1516 {
1517 intN err;
1518 const jschar *errp = state->cp;
1519
1520 err = ParseMinMaxQuantifier(state, JS_FALSE);
1521 if (err == 0)
1522 goto quantifier;
1523 if (err == -1)
1524 return JS_TRUE;
1525
1526 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1527 return JS_FALSE;
1528 }
1529 default:;
1530 }
1531 }
1532 return JS_TRUE;
1533
1534 quantifier:
1535 if (state->treeDepth == TREE_DEPTH_MAX) {
1536 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1537 return JS_FALSE;
1538 }
1539
1540 ++state->treeDepth;
1541 ++state->cp;
1542 state->result->kid = term;
1543 if (state->cp < state->cpend && *state->cp == '?') {
1544 ++state->cp;
1545 state->result->u.range.greedy = JS_FALSE;
1546 } else {
1547 state->result->u.range.greedy = JS_TRUE;
1548 }
1549 return JS_TRUE;
1550 }
1551
1552 static intN
1553 ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues)
1554 {
1555 uintN min, max;
1556 jschar c;
1557 const jschar *errp = state->cp++;
1558
1559 c = *state->cp;
1560 if (JS7_ISDEC(c)) {
1561 ++state->cp;
1562 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1563 c = *state->cp;
1564
1565 if (!ignoreValues && min == OVERFLOW_VALUE)
1566 return JSMSG_MIN_TOO_BIG;
1567
1568 if (c == ',') {
1569 c = *++state->cp;
1570 if (JS7_ISDEC(c)) {
1571 ++state->cp;
1572 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1573 c = *state->cp;
1574 if (!ignoreValues && max == OVERFLOW_VALUE)
1575 return JSMSG_MAX_TOO_BIG;
1576 if (!ignoreValues && min > max)
1577 return JSMSG_OUT_OF_ORDER;
1578 } else {
1579 max = (uintN)-1;
1580 }
1581 } else {
1582 max = min;
1583 }
1584 if (c == '}') {
1585 state->result = NewRENode(state, REOP_QUANT);
1586 if (!state->result)
1587 return JSMSG_OUT_OF_MEMORY;
1588 state->result->u.range.min = min;
1589 state->result->u.range.max = max;
1590 /*
1591 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1592 * where <max> is written as compact(max+1) to make
1593 * (uintN)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1594 */
1595 state->progLength += (1 + GetCompactIndexWidth(min)
1596 + GetCompactIndexWidth(max + 1)
1597 +3);
1598 return 0;
1599 }
1600 }
1601
1602 state->cp = errp;
1603 return -1;
1604 }
1605
1606 static JSBool
1607 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
1608 {
1609 ptrdiff_t offset = target - jump;
1610
1611 /* Check that target really points forward. */
1612 JS_ASSERT(offset >= 2);
1613 if ((size_t)offset > OFFSET_MAX)
1614 return JS_FALSE;
1615
1616 jump[0] = JUMP_OFFSET_HI(offset);
1617 jump[1] = JUMP_OFFSET_LO(offset);
1618 return JS_TRUE;
1619 }
1620
1621 /* Copy the charset data from a character class node to the charset list
1622 * in the regexp object. */
1623 static JS_ALWAYS_INLINE RECharSet *
1624 InitNodeCharSet(JSRegExp *re, RENode *node)
1625 {
1626 RECharSet *charSet = &re->classList[node->u.ucclass.index];
1627 charSet->converted = JS_FALSE;
1628 charSet->length = node->u.ucclass.bmsize;
1629 charSet->u.src.startIndex = node->u.ucclass.startIndex;
1630 charSet->u.src.length = node->u.ucclass.kidlen;
1631 charSet->sense = node->u.ucclass.sense;
1632 return charSet;
1633 }
1634
1635 /*
1636 * Generate bytecode for the tree rooted at t using an explicit stack instead
1637 * of recursion.
1638 */
1639 static jsbytecode *
1640 EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
1641 jsbytecode *pc, RENode *t)
1642 {
1643 EmitStateStackEntry *emitStateSP, *emitStateStack;
1644 REOp op;
1645
1646 if (treeDepth == 0) {
1647 emitStateStack = NULL;
1648 } else {
1649 emitStateStack =
1650 (EmitStateStackEntry *)JS_malloc(state->context,
1651 sizeof(EmitStateStackEntry) *
1652 treeDepth);
1653 if (!emitStateStack)
1654 return NULL;
1655 }
1656 emitStateSP = emitStateStack;
1657 op = t->op;
1658 JS_ASSERT(op < REOP_LIMIT);
1659
1660 for (;;) {
1661 *pc++ = op;
1662 switch (op) {
1663 case REOP_EMPTY:
1664 --pc;
1665 break;
1666
1667 case REOP_ALTPREREQ2:
1668 case REOP_ALTPREREQ:
1669 JS_ASSERT(emitStateSP);
1670 emitStateSP->altHead = pc - 1;
1671 emitStateSP->endTermFixup = pc;
1672 pc += OFFSET_LEN;
1673 SET_ARG(pc, t->u.altprereq.ch1);
1674 pc += ARG_LEN;
1675 SET_ARG(pc, t->u.altprereq.ch2);
1676 pc += ARG_LEN;
1677
1678 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
1679 pc += OFFSET_LEN;
1680
1681 emitStateSP->continueNode = t;
1682 emitStateSP->continueOp = REOP_JUMP;
1683 emitStateSP->jumpToJumpFlag = JS_FALSE;
1684 ++emitStateSP;
1685 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1686 t = (RENode *) t->kid;
1687 op = t->op;
1688 JS_ASSERT(op < REOP_LIMIT);
1689 continue;
1690
1691 case REOP_JUMP:
1692 emitStateSP->nextTermFixup = pc; /* offset to following term */
1693 pc += OFFSET_LEN;
1694 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
1695 goto jump_too_big;
1696 emitStateSP->continueOp = REOP_ENDALT;
1697 ++emitStateSP;
1698 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1699 t = (RENode *) t->u.kid2;
1700 op = t->op;
1701 JS_ASSERT(op < REOP_LIMIT);
1702 continue;
1703
1704 case REOP_ENDALT:
1705 /*
1706 * If we already patched emitStateSP->nextTermFixup to jump to
1707 * a nearer jump, to avoid 16-bit immediate offset overflow, we
1708 * are done here.
1709 */
1710 if (emitStateSP->jumpToJumpFlag)
1711 break;
1712
1713 /*
1714 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
1715 * REOP_ENDALT is executed only on successful match of the last
1716 * alternate in a group.
1717 */
1718 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1719 goto jump_too_big;
1720 if (t->op != REOP_ALT) {
1721 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
1722 goto jump_too_big;
1723 }
1724
1725 /*
1726 * If the program is bigger than the REOP_JUMP offset range, then
1727 * we must check for alternates before this one that are part of
1728 * the same group, and fix up their jump offsets to target jumps
1729 * close enough to fit in a 16-bit unsigned offset immediate.
1730 */
1731 if ((size_t)(pc - re->program) > OFFSET_MAX &&
1732 emitStateSP > emitStateStack) {
1733 EmitStateStackEntry *esp, *esp2;
1734 jsbytecode *alt, *jump;
1735 ptrdiff_t span, header;
1736
1737 esp2 = emitStateSP;
1738 alt = esp2->altHead;
1739 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
1740 if (esp->continueOp == REOP_ENDALT &&
1741 !esp->jumpToJumpFlag &&
1742 esp->nextTermFixup + OFFSET_LEN == alt &&
1743 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
1744 ? esp->endTermFixup
1745 : esp->nextTermFixup)) > OFFSET_MAX) {
1746 alt = esp->altHead;
1747 jump = esp->nextTermFixup;
1748
1749 /*
1750 * The span must be 1 less than the distance from
1751 * jump offset to jump offset, so we actually jump
1752 * to a REOP_JUMP bytecode, not to its offset!
1753 */
1754 for (;;) {
1755 JS_ASSERT(jump < esp2->nextTermFixup);
1756 span = esp2->nextTermFixup - jump - 1;
1757 if ((size_t)span <= OFFSET_MAX)
1758 break;
1759 do {
1760 if (--esp2 == esp)
1761 goto jump_too_big;
1762 } while (esp2->continueOp != REOP_ENDALT);
1763 }
1764
1765 jump[0] = JUMP_OFFSET_HI(span);
1766 jump[1] = JUMP_OFFSET_LO(span);
1767
1768 if (esp->continueNode->op != REOP_ALT) {
1769 /*
1770 * We must patch the offset at esp->endTermFixup
1771 * as well, for the REOP_ALTPREREQ{,2} opcodes.
1772 * If we're unlucky and endTermFixup is more than
1773 * OFFSET_MAX bytes from its target, we cheat by
1774 * jumping 6 bytes to the jump whose offset is at
1775 * esp->nextTermFixup, which has the same target.
1776 */
1777 jump = esp->endTermFixup;
1778 header = esp->nextTermFixup - jump;
1779 span += header;
1780 if ((size_t)span > OFFSET_MAX)
1781 span = header;
1782
1783 jump[0] = JUMP_OFFSET_HI(span);
1784 jump[1] = JUMP_OFFSET_LO(span);
1785 }
1786
1787 esp->jumpToJumpFlag = JS_TRUE;
1788 }
1789 }
1790 }
1791 break;
1792
1793 case REOP_ALT:
1794 JS_ASSERT(emitStateSP);
1795 emitStateSP->altHead = pc - 1;
1796 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
1797 pc += OFFSET_LEN;
1798 emitStateSP->continueNode = t;
1799 emitStateSP->continueOp = REOP_JUMP;
1800 emitStateSP->jumpToJumpFlag = JS_FALSE;
1801 ++emitStateSP;
1802 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1803 t = (RENode *) t->kid;
1804 op = t->op;
1805 JS_ASSERT(op < REOP_LIMIT);
1806 continue;
1807
1808 case REOP_FLAT:
1809 /*
1810 * Coalesce FLATs if possible and if it would not increase bytecode
1811 * beyond preallocated limit. The latter happens only when bytecode
1812 * size for coalesced string with offset p and length 2 exceeds 6
1813 * bytes preallocated for 2 single char nodes, i.e. when
1814 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
1815 * GetCompactIndexWidth(p) > 4.
1816 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
1817 * nodes strictly decreases bytecode size, the check has to be
1818 * done only for the first coalescing.
1819 */
1820 if (t->kid &&
1821 GetCompactIndexWidth((jschar *)t->kid - state->cpbegin) <= 4)
1822 {
1823 while (t->next &&
1824 t->next->op == REOP_FLAT &&
1825 (jschar*)t->kid + t->u.flat.length ==
1826 (jschar*)t->next->kid) {
1827 t->u.flat.length += t->next->u.flat.length;
1828 t->next = t->next->next;
1829 }
1830 }
1831 if (t->kid && t->u.flat.length > 1) {
1832 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
1833 pc = WriteCompactIndex(pc, (jschar *)t->kid - state->cpbegin);
1834 pc = WriteCompactIndex(pc, t->u.flat.length);
1835 } else if (t->u.flat.chr < 256) {
1836 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
1837 *pc++ = (jsbytecode) t->u.flat.chr;
1838 } else {
1839 pc[-1] = (state->flags & JSREG_FOLD)
1840 ? REOP_UCFLAT1i
1841 : REOP_UCFLAT1;
1842 SET_ARG(pc, t->u.flat.chr);
1843 pc += ARG_LEN;
1844 }
1845 break;
1846
1847 case REOP_LPAREN:
1848 JS_ASSERT(emitStateSP);
1849 pc = WriteCompactIndex(pc, t->u.parenIndex);
1850 emitStateSP->continueNode = t;
1851 emitStateSP->continueOp = REOP_RPAREN;
1852 ++emitStateSP;
1853 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1854 t = (RENode *) t->kid;
1855 op = t->op;
1856 continue;
1857
1858 case REOP_RPAREN:
1859 pc = WriteCompactIndex(pc, t->u.parenIndex);
1860 break;
1861
1862 case REOP_BACKREF:
1863 pc = WriteCompactIndex(pc, t->u.parenIndex);
1864 break;
1865
1866 case REOP_ASSERT:
1867 JS_ASSERT(emitStateSP);
1868 emitStateSP->nextTermFixup = pc;
1869 pc += OFFSET_LEN;
1870 emitStateSP->continueNode = t;
1871 emitStateSP->continueOp = REOP_ASSERTTEST;
1872 ++emitStateSP;
1873 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1874 t = (RENode *) t->kid;
1875 op = t->op;
1876 continue;
1877
1878 case REOP_ASSERTTEST:
1879 case REOP_ASSERTNOTTEST:
1880 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1881 goto jump_too_big;
1882 break;
1883
1884 case REOP_ASSERT_NOT:
1885 JS_ASSERT(emitStateSP);
1886 emitStateSP->nextTermFixup = pc;
1887 pc += OFFSET_LEN;
1888 emitStateSP->continueNode = t;
1889 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
1890 ++emitStateSP;
1891 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1892 t = (RENode *) t->kid;
1893 op = t->op;
1894 continue;
1895
1896 case REOP_QUANT:
1897 JS_ASSERT(emitStateSP);
1898 if (t->u.range.min == 0 && t->u.range.max == (uintN)-1) {
1899 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
1900 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
1901 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
1902 } else if (t->u.range.min == 1 && t->u.range.max == (uintN) -1) {
1903 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
1904 } else {
1905 if (!t->u.range.greedy)
1906 pc[-1] = REOP_MINIMALQUANT;
1907 pc = WriteCompactIndex(pc, t->u.range.min);
1908 /*
1909 * Write max + 1 to avoid using size_t(max) + 1 bytes
1910 * for (uintN)-1 sentinel.
1911 */
1912 pc = WriteCompactIndex(pc, t->u.range.max + 1);
1913 }
1914 emitStateSP->nextTermFixup = pc;
1915 pc += OFFSET_LEN;
1916 emitStateSP->continueNode = t;
1917 emitStateSP->continueOp = REOP_ENDCHILD;
1918 ++emitStateSP;
1919 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1920 t = (RENode *) t->kid;
1921 op = t->op;
1922 continue;
1923
1924 case REOP_ENDCHILD:
1925 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1926 goto jump_too_big;
1927 break;
1928
1929 case REOP_CLASS:
1930 if (!t->u.ucclass.sense)
1931 pc[-1] = REOP_NCLASS;
1932 pc = WriteCompactIndex(pc, t->u.ucclass.index);
1933 InitNodeCharSet(re, t);
1934 break;
1935
1936 default:
1937 break;
1938 }
1939
1940 t = t->next;
1941 if (t) {
1942 op = t->op;
1943 } else {
1944 if (emitStateSP == emitStateStack)
1945 break;
1946 --emitStateSP;
1947 t = emitStateSP->continueNode;
1948 op = (REOp) emitStateSP->continueOp;
1949 }
1950 }
1951
1952 cleanup:
1953 if (emitStateStack)
1954 JS_free(state->context, emitStateStack);
1955 return pc;
1956
1957 jump_too_big:
1958 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1959 pc = NULL;
1960 goto cleanup;
1961 }
1962
1963 #ifdef JS_TRACER
1964 typedef List<LIns*, LIST_NonGCObjects> LInsList;
1965
1966 /* Dummy GC for nanojit placement new. */
1967 static GC gc;
1968
1969 class RegExpNativeCompiler {
1970 private:
1971 JSRegExp* re; /* Careful: not fully initialized */
1972 CompilerState* cs; /* RegExp to compile */
1973 Fragment* fragment;
1974 LirWriter* lir;
1975
1976 LIns* state;
1977 LIns* gdata;
1978 LIns* cpend;
1979
1980 JSBool isCaseInsensitive() const { return cs->flags & JSREG_FOLD; }
1981
1982 void targetCurrentPoint(LIns* ins) { ins->target(lir->ins0(LIR_label)); }
1983
1984 void targetCurrentPoint(LInsList& fails)
1985 {
1986 LIns* fail = lir->ins0(LIR_label);
1987 for (size_t i = 0; i < fails.size(); ++i) {
1988 fails[i]->target(fail);
1989 }
1990 fails.clear();
1991 }
1992
1993 /*
1994 * These functions return the new position after their match operation,
1995 * or NULL if there was an error.
1996 */
1997 LIns* compileEmpty(RENode* node, LIns* pos, LInsList& fails)
1998 {
1999 return pos;
2000 }
2001
2002 LIns* compileFlatSingleChar(RENode* node, LIns* pos, LInsList& fails)
2003 {
2004 /*
2005 * Fast case-insensitive test for ASCII letters: convert text
2006 * char to lower case by bit-or-ing in 32 and compare.
2007 */
2008 JSBool useFastCI = JS_FALSE;
2009 jschar ch = node->u.flat.chr; /* char to test for */
2010 jschar ch2 = ch; /* 2nd char to test for if ci */
2011 if (cs->flags & JSREG_FOLD) {
2012 if (L'A' <= ch && ch <= L'Z' || L'a' <= ch || ch <= L'z') {
2013 ch |= 32;
2014 ch2 = ch;
2015 useFastCI = JS_TRUE;
2016 } else if (JS_TOLOWER(ch) != ch) {
2017 ch2 = JS_TOLOWER(ch);
2018 ch = JS_TOUPPER(ch);
2019 }
2020 }
2021
2022 LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_lt, pos, cpend), 0);
2023 fails.add(to_fail);
2024 LIns* text_ch = lir->insLoad(LIR_ldcs, pos, lir->insImm(0));
2025 LIns* comp_ch = useFastCI ?
2026 lir->ins2(LIR_or, text_ch, lir->insImm(32)) :
2027 text_ch;
2028 if (ch == ch2) {
2029 fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch)), 0));
2030 } else {
2031 LIns* to_ok = lir->insBranch(LIR_jt, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch)), 0);
2032 fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch2)), 0));
2033 targetCurrentPoint(to_ok);
2034 }
2035
2036 return lir->ins2(LIR_piadd, pos, lir->insImm(2));
2037 }
2038
2039 LIns* compileClass(RENode* node, LIns* pos, LInsList& fails)
2040 {
2041 if (!node->u.ucclass.sense)
2042 return JS_FALSE;
2043
2044 RECharSet* charSet = InitNodeCharSet(re, node);
2045 LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_lt, pos, cpend), 0);
2046 fails.add(to_fail);
2047 LIns* text_ch = lir->insLoad(LIR_ldcs, pos, lir->insImm(0));
2048 fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_le, text_ch, lir->insImm(charSet->length)), 0));
2049 LIns* byteIndex = lir->ins2(LIR_rsh, text_ch, lir->insImm(3));
2050 LIns* bitmap = lir->insLoad(LIR_ld, lir->insImmPtr(charSet), (int) offsetof(RECharSet, u.bits));
2051 LIns* byte = lir->insLoad(LIR_ldcb, lir->ins2(LIR_piadd, bitmap, byteIndex), (int) 0);
2052 LIns* bitMask = lir->ins2(LIR_lsh, lir->insImm(1),
2053 lir->ins2(LIR_and, text_ch, lir->insImm(0x7)));
2054 LIns* test = lir->ins2(LIR_eq, lir->ins2(LIR_and, byte, bitMask), lir->insImm(0));
2055
2056 LIns* to_next = lir->insBranch(LIR_jt, test, 0);
2057 fails.add(to_next);
2058 return lir->ins2(LIR_piadd, pos, lir->insImm(2));
2059 }
2060
2061 LIns* compileAlt(RENode* node, LIns* pos, LInsList& fails)
2062 {
2063 LInsList kidFails(NULL);
2064 if (!compileNode((RENode *) node->kid, pos, kidFails))
2065 return JS_FALSE;
2066 if (!compileNode(node->next, pos, kidFails))
2067 return JS_FALSE;
2068
2069 targetCurrentPoint(kidFails);
2070 if (!compileNode(node->u.altprereq.kid2, pos, fails))
2071 return JS_FALSE;
2072 /*
2073 * Disable compilation for any regexp where something follows an
2074 * alternation. To make this work, we need to redesign to either
2075 * (a) pass around continuations so we know the right place to go
2076 * when we logically return, or (b) generate explicit backtracking
2077 * code.
2078 */
2079 if (node->next)
2080 return JS_FALSE;
2081 return pos;
2082 }
2083
2084 JSBool compileNode(RENode* node, LIns* pos, LInsList& fails)
2085 {
2086 for (; node; node = node->next) {
2087 if (fragment->lirbuf->outOmem())
2088 return JS_FALSE;
2089
2090 switch (node->op) {
2091 case REOP_EMPTY:
2092 pos = compileEmpty(node, pos, fails);
2093 break;
2094 case REOP_FLAT:
2095 if (node->u.flat.length != 1)
2096 return JS_FALSE;
2097 pos = compileFlatSingleChar(node, pos, fails);
2098 break;
2099 case REOP_ALT:
2100 case REOP_ALTPREREQ:
2101 pos = compileAlt(node, pos, fails);
2102 break;
2103 case REOP_CLASS:
2104 pos = compileClass(node, pos, fails);
2105 break;
2106 default:
2107 return JS_FALSE;
2108 }
2109 if (!pos)
2110 return JS_FALSE;
2111 }
2112
2113 lir->insStorei(pos, state, (int) offsetof(REMatchState, cp));
2114 lir->ins1(LIR_ret, state);
2115 return JS_TRUE;
2116 }
2117
2118 JSBool compileSticky(RENode* root, LIns* start)
2119 {
2120 LInsList fails(NULL);
2121 if (!compileNode(root, start, fails))
2122 return JS_FALSE;
2123 targetCurrentPoint(fails);
2124 lir->ins1(LIR_ret, lir->insImm(0));
2125 return JS_TRUE;
2126 }
2127
2128 JSBool compileAnchoring(RENode* root, LIns* start)
2129 {
2130 /* Even at the end, the empty regexp would match. */
2131 LIns* to_next = lir->insBranch(LIR_jf,
2132 lir->ins2(LIR_le, start, cpend), 0);
2133 LInsList fails(NULL);
2134 if (!compileNode(root, start, fails))
2135 return JS_FALSE;
2136
2137 targetCurrentPoint(to_next);
2138 lir->ins1(LIR_ret, lir->insImm(0));
2139
2140 targetCurrentPoint(fails);
2141 lir->insStorei(lir->ins2(LIR_piadd, start, lir->insImm(2)), gdata,
2142 (int) offsetof(REGlobalData, skipped));
2143
2144 return JS_TRUE;
2145 }
2146
2147 inline LIns*
2148 addName(LirBuffer* lirbuf, LIns* ins, const char* name)
2149 {
2150 debug_only_v(lirbuf->names->addName(ins, name);)
2151 return ins;
2152 }
2153
2154 public:
2155 RegExpNativeCompiler(JSRegExp *re, CompilerState *cs)
2156 : re(re), cs(cs), fragment(NULL) { }
2157
2158 JSBool compile(JSContext* cx)
2159 {
2160 GuardRecord* guard;
2161 LIns* skip;
2162 LIns* start;
2163 bool oom = false;
2164
2165 Fragmento* fragmento = JS_TRACE_MONITOR(cx).reFragmento;
2166 fragment = fragmento->getLoop(re);
2167 if (!fragment) {
2168 fragment = fragmento->getAnchor(re);
2169 fragment->lirbuf = new (&gc) LirBuffer(fragmento, NULL);
2170 /* Scary: required to have the onDestroy method delete the lirbuf. */
2171 fragment->root = fragment;
2172 }
2173 LirBuffer* lirbuf = fragment->lirbuf;
2174 LirBufWriter* lirb;
2175 if (lirbuf->outOmem()) goto fail2;
2176 /* FIXME Use bug 463260 smart pointer when available. */
2177 lir = lirb = new (&gc) LirBufWriter(lirbuf);
2178
2179 /* FIXME Use bug 463260 smart pointer when available. */
2180 debug_only_v(fragment->lirbuf->names = new (&gc) LirNameMap(&gc, NULL, fragmento->labels);)
2181 /* FIXME Use bug 463260 smart pointer when available. */
2182 debug_only_v(lir = new (&gc) VerboseWriter(&gc, lir, lirbuf->names);)
2183
2184 lir->ins0(LIR_start);
2185 lirbuf->state = state = addName(lirbuf, lir->insParam(0, 0), "state");
2186 lirbuf->param1 = gdata = addName(lirbuf, lir->insParam(1, 0), "gdata");
2187 start = addName(lirbuf, lir->insLoad(LIR_ldp, lirbuf->param1, (int) offsetof(REGlobalData, skipped)), "start");
2188 cpend = addName(lirbuf, lir->insLoad(LIR_ldp, lirbuf->param1, offsetof(REGlobalData, cpend)), "cpend");
2189
2190 if (cs->flags & JSREG_STICKY) {
2191 if (!compileSticky(cs->result, start)) goto fail;
2192 } else {
2193 if (!compileAnchoring(cs->result, start)) goto fail;
2194 }
2195
2196 /* Create fake guard record for loop edge. */
2197 skip = lirb->skip(sizeof(GuardRecord) + sizeof(SideExit));
2198 guard = (GuardRecord *) skip->payload();
2199 memset(guard, 0, sizeof(*guard));
2200 guard->exit = (SideExit *) guard+1;
2201 guard->exit->target = fragment;
2202 fragment->lastIns = lir->insGuard(LIR_loop, lir->insImm(1), skip);
2203
2204 ::compile(fragmento->assm(), fragment);
2205 if (fragmento->assm()->error() != nanojit::None) {
2206 oom = fragmento->assm()->error() == nanojit::OutOMem;
2207 goto fail;
2208 }
2209
2210 delete lirb;
2211 debug_only_v(delete lir;)
2212 return JS_TRUE;
2213 fail:
2214 delete lirb;
2215 debug_only_v(delete lir;)
2216 fail2:
2217 if (lirbuf->outOmem() || oom)
2218 fragmento->clearFrags();
2219 return JS_FALSE;
2220 }
2221 };
2222
2223 static inline JSBool
2224 js_CompileRegExpToNative(JSContext *cx, JSRegExp *re, CompilerState *cs)
2225 {
2226 RegExpNativeCompiler rc(re, cs);
2227 return rc.compile(cx);
2228 }
2229 #endif
2230
2231 JSRegExp *
2232 js_NewRegExp(JSContext *cx, JSTokenStream *ts,
2233 JSString *str, uintN flags, JSBool flat)
2234 {
2235 JSRegExp *re;
2236 void *mark;
2237 CompilerState state;
2238 size_t resize;
2239 jsbytecode *endPC;
2240 uintN i;
2241 size_t len;
2242
2243 re = NULL;
2244 mark = JS_ARENA_MARK(&cx->tempPool);
2245 len = JSSTRING_LENGTH(str);
2246
2247 state.context = cx;
2248 state.tokenStream = ts;
2249 state.cp = js_UndependString(cx, str);
2250 if (!state.cp)
2251 goto out;
2252 state.cpbegin = state.cp;
2253 state.cpend = state.cp + len;
2254 state.flags = flags;
2255 state.parenCount = 0;
2256 state.classCount = 0;
2257 state.progLength = 0;
2258 state.treeDepth = 0;
2259 state.classBitmapsMem = 0;
2260 for (i = 0; i < CLASS_CACHE_SIZE; i++)
2261 state.classCache[i].start = NULL;
2262
2263 if (len != 0 && flat) {
2264 state.result = NewRENode(&state, REOP_FLAT);
2265 if (!state.result)
2266 goto out;
2267 state.result->u.flat.chr = *state.cpbegin;
2268 state.result->u.flat.length = len;
2269 state.result->kid = (void *) state.cpbegin;
2270 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
2271 state.progLength += 1 + GetCompactIndexWidth(0)
2272 + GetCompactIndexWidth(len);
2273 } else {
2274 if (!ParseRegExp(&state))
2275 goto out;
2276 }
2277
2278 resize = offsetof(JSRegExp, program) + state.progLength + 1;
2279 re = (JSRegExp *) JS_malloc(cx, resize);
2280 if (!re)
2281 goto out;
2282
2283 re->nrefs = 1;
2284 JS_ASSERT(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
2285 re->classCount = state.classCount;
2286 if (re->classCount) {
2287 re->classList = (RECharSet *)
2288 JS_malloc(cx, re->classCount * sizeof(RECharSet));
2289 if (!re->classList) {
2290 js_DestroyRegExp(cx, re);
2291 re = NULL;
2292 goto out;
2293 }
2294 for (i = 0; i < re->classCount; i++)
2295 re->classList[i].converted = JS_FALSE;
2296 } else {
2297 re->classList = NULL;
2298 }
2299
2300 #ifdef JS_TRACER
2301 /*
2302 * Try compiling the native code version. For the time being we also
2303 * compile the bytecode version in case we evict the native code
2304 * version from the code cache.
2305 */
2306 if (TRACING_ENABLED(cx))
2307 js_CompileRegExpToNative(cx, re, &state);
2308 #endif
2309 /* Compile the bytecode version. */
2310 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
2311 if (!endPC) {
2312 js_DestroyRegExp(cx, re);
2313 re = NULL;
2314 goto out;
2315 }
2316 *endPC++ = REOP_END;
2317 /*
2318 * Check whether size was overestimated and shrink using realloc.
2319 * This is safe since no pointers to newly parsed regexp or its parts
2320 * besides re exist here.
2321 */
2322 #if 0
2323 /*
2324 * FIXME: Until bug 464866 is fixed, we can't move the re object so
2325 * don't shrink it for now.
2326 */
2327 if ((size_t)(endPC - re->program) != state.progLength + 1) {
2328 JSRegExp *tmp;
2329 JS_ASSERT((size_t)(endPC - re->program) < state.progLength + 1);
2330 resize = offsetof(JSRegExp, program) + (endPC - re->program);
2331 tmp = (JSRegExp *) JS_realloc(cx, re, resize);
2332 if (tmp)
2333 re = tmp;
2334 }
2335 #endif
2336
2337 re->flags = flags;
2338 re->parenCount = state.parenCount;
2339 re->source = str;
2340
2341 out:
2342 JS_ARENA_RELEASE(&cx->tempPool, mark);
2343 return re;
2344 }
2345
2346 JSRegExp *
2347 js_NewRegExpOpt(JSContext *cx, JSString *str, JSString *opt, JSBool flat)
2348 {
2349 uintN flags;
2350 jschar *s;
2351 size_t i, n;
2352 char charBuf[2];
2353
2354 flags = 0;
2355 if (opt) {
2356 JSSTRING_CHARS_AND_LENGTH(opt, s, n);
2357 for (i = 0; i < n; i++) {
2358 switch (s[i]) {
2359 case 'g':
2360 flags |= JSREG_GLOB;
2361 break;
2362 case 'i':
2363 flags |= JSREG_FOLD;
2364 break;
2365 case 'm':
2366 flags |= JSREG_MULTILINE;
2367 break;
2368 case 'y':
2369 flags |= JSREG_STICKY;
2370 break;
2371 default:
2372 charBuf[0] = (char)s[i];
2373 charBuf[1] = '\0';
2374 JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR,
2375 js_GetErrorMessage, NULL,
2376 JSMSG_BAD_FLAG, charBuf);
2377 return NULL;
2378 }
2379 }
2380 }
2381 return js_NewRegExp(cx, NULL, str, flags, flat);
2382 }
2383
2384 /*
2385 * Save the current state of the match - the position in the input
2386 * text as well as the position in the bytecode. The state of any
2387 * parent expressions is also saved (preceding state).
2388 * Contents of parenCount parentheses from parenIndex are also saved.
2389 */
2390 static REBackTrackData *
2391 PushBackTrackState(REGlobalData *gData, REOp op,
2392 jsbytecode *target, REMatchState *x, const jschar *cp,
2393 size_t parenIndex, size_t parenCount)
2394 {
2395 size_t i;
2396 REBackTrackData *result =
2397 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
2398
2399 size_t sz = sizeof(REBackTrackData) +
2400 gData->stateStackTop * sizeof(REProgState) +
2401 parenCount * sizeof(RECapture);
2402
2403 ptrdiff_t btsize = gData->backTrackStackSize;
2404 ptrdiff_t btincr = ((char *)result + sz) -
2405 ((char *)gData->backTrackStack + btsize);
2406
2407 re_debug("\tBT_Push: %lu,%lu",
2408 (unsigned long) parenIndex, (unsigned long) parenCount);
2409
2410 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
2411 if (btincr > 0) {
2412 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
2413
2414 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
2415 btincr = JS_ROUNDUP(btincr, btsize);
2416 JS_ARENA_GROW_CAST(gData->backTrackStack, REBackTrackData *,
2417 &gData->cx->regexpPool, btsize, btincr);
2418 if (!gData->backTrackStack) {
2419 js_ReportOutOfScriptQuota(gData->cx);
2420 gData->ok = JS_FALSE;
2421 return NULL;
2422 }
2423 gData->backTrackStackSize = btsize + btincr;
2424 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
2425 }
2426 gData->backTrackSP = result;
2427 result->sz = gData->cursz;
2428 gData->cursz = sz;
2429
2430 result->backtrack_op = op;
2431 result->backtrack_pc = target;
2432 result->cp = cp;
2433 result->parenCount = parenCount;
2434 result->parenIndex = parenIndex;
2435
2436 result->saveStateStackTop = gData->stateStackTop;
2437 JS_ASSERT(gData->stateStackTop);
2438 memcpy(result + 1, gData->stateStack,
2439 sizeof(REProgState) * result->saveStateStackTop);
2440
2441 if (parenCount != 0) {
2442 memcpy((char *)(result + 1) +
2443 sizeof(REProgState) * result->saveStateStackTop,
2444 &x->parens[parenIndex],
2445 sizeof(RECapture) * parenCount);
2446 for (i = 0; i != parenCount; i++)
2447 x->parens[parenIndex + i].index = -1;
2448 }
2449
2450 return result;
2451 }
2452
2453
2454 /*
2455 * Consecutive literal characters.
2456 */
2457 #if 0
2458 static REMatchState *
2459 FlatNMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars,
2460 size_t length)
2461 {
2462 size_t i;
2463 if (length > gData->cpend - x->cp)
2464 return NULL;
2465 for (i = 0; i != length; i++) {
2466 if (matchChars[i] != x->cp[i])
2467 return NULL;
2468 }
2469 x->cp += length;
2470 return x;
2471 }
2472 #endif
2473
2474 static JS_ALWAYS_INLINE REMatchState *
2475 FlatNIMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars,
2476 size_t length)
2477 {
2478 size_t i;
2479 JS_ASSERT(gData->cpend >= x->cp);
2480 if (length > (size_t)(gData->cpend - x->cp))
2481 return NULL;
2482 for (i = 0; i != length; i++) {
2483 if (upcase(matchChars[i]) != upcase(x->cp[i]))
2484 return NULL;
2485 }
2486 x->cp += length;
2487 return x;
2488 }
2489
2490 /*
2491 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2492 * 2. If E is not a character then go to step 6.
2493 * 3. Let ch be E's character.
2494 * 4. Let A be a one-element RECharSet containing the character ch.
2495 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2496 * 6. E must be an integer. Let n be that integer.
2497 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2498 * 8. Return an internal Matcher closure that takes two arguments, a State x
2499 * and a Continuation c, and performs the following:
2500 * 1. Let cap be x's captures internal array.
2501 * 2. Let s be cap[n].
2502 * 3. If s is undefined, then call c(x) and return its result.
2503 * 4. Let e be x's endIndex.
2504 * 5. Let len be s's length.
2505 * 6. Let f be e+len.
2506 * 7. If f>InputLength, return failure.
2507 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2508 * such that Canonicalize(s[i]) is not the same character as
2509 * Canonicalize(Input [e+i]), then return failure.
2510 * 9. Let y be the State (f, cap).
2511 * 10. Call c(y) and return its result.
2512 */
2513 static REMatchState *
2514 BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex)
2515 {
2516 size_t len, i;
2517 const jschar *parenContent;
2518 RECapture *cap = &x->parens[parenIndex];
2519
2520 if (cap->index == -1)
2521 return x;
2522
2523 len = cap->length;
2524 if (x->cp + len > gData->cpend)
2525 return NULL;
2526
2527 parenContent = &gData->cpbegin[cap->index];
2528 if (gData->regexp->flags & JSREG_FOLD) {
2529 for (i = 0; i < len; i++) {
2530 if (upcase(parenContent[i]) != upcase(x->cp[i]))
2531 return NULL;
2532 }
2533 } else {
2534 for (i = 0; i < len; i++) {
2535 if (parenContent[i] != x->cp[i])
2536 return NULL;
2537 }
2538 }
2539 x->cp += len;
2540 return x;
2541 }
2542
2543
2544 /* Add a single character to the RECharSet */
2545 static void
2546 AddCharacterToCharSet(RECharSet *cs, jschar c)
2547 {
2548 uintN byteIndex = (uintN)(c >> 3);
2549 JS_ASSERT(c <= cs->length);
2550 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2551 }
2552
2553
2554 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2555 static void
2556 AddCharacterRangeToCharSet(RECharSet *cs, uintN c1, uintN c2)
2557 {
2558 uintN i;
2559
2560 uintN byteIndex1 = c1 >> 3;
2561 uintN byteIndex2 = c2 >> 3;
2562
2563 JS_ASSERT(c2 <= cs->length && c1 <= c2);
2564
2565 c1 &= 0x7;
2566 c2 &= 0x7;
2567
2568 if (byteIndex1 == byteIndex2) {
2569 cs->u.bits[byteIndex1] |= ((uint8)0xFF >> (7 - (c2 - c1))) << c1;
2570 } else {
2571 cs->u.bits[byteIndex1] |= 0xFF << c1;
2572 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2573 cs->u.bits[i] = 0xFF;
2574 cs->u.bits[byteIndex2] |= (uint8)0xFF >> (7 - c2);
2575 }
2576 }
2577
2578 struct CharacterRange {
2579 jschar start;
2580 jschar end;
2581 };
2582
2583 /*
2584 * The following characters are taken from the ECMA-262 standard, section 7.2
2585 * and 7.3, and the Unicode 3 standard, Table 6-1.
2586 */
2587 static const CharacterRange WhiteSpaceRanges[] = {
2588 /* TAB, LF, VT, FF, CR */
2589 { 0x0009, 0x000D },
2590 /* SPACE */
2591 { 0x0020, 0x0020 },
2592 /* NO-BREAK SPACE */
2593 { 0x00A0, 0x00A0 },
2594 /*
2595 * EN QUAD, EM QUAD, EN SPACE, EM SPACE, THREE-PER-EM SPACE, FOUR-PER-EM
2596 * SPACE, SIX-PER-EM SPACE, FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE,
2597 * HAIR SPACE, ZERO WIDTH SPACE
2598 */
2599 { 0x2000, 0x200B },
2600 /* LS, PS */
2601 { 0x2028, 0x2029 },
2602 /* NARROW NO-BREAK SPACE */
2603 { 0x202F, 0x202F },
2604 /* IDEOGRAPHIC SPACE */
2605 { 0x3000, 0x3000 }
2606 };
2607
2608 /* ECMA-262 standard, section 15.10.2.6. */
2609 static const CharacterRange WordRanges[] = {
2610 { jschar('0'), jschar('9') },
2611 { jschar('A'), jschar('Z') },
2612 { jschar('_'), jschar('_') },
2613 { jschar('a'), jschar('z') }
2614 };
2615
2616 static void
2617 AddCharacterRanges(RECharSet *charSet,
2618 const CharacterRange *range,
2619 const CharacterRange *end)
2620 {
2621 for (; range < end; ++range)
2622 AddCharacterRangeToCharSet(charSet, range->start, range->end);
2623 }
2624
2625 static void
2626 AddInvertedCharacterRanges(RECharSet *charSet,
2627 const CharacterRange *range,
2628 const CharacterRange *end)
2629 {
2630 uint16 previous = 0;
2631 for (; range < end; ++range) {
2632 AddCharacterRangeToCharSet(charSet, previous, range->start - 1);
2633 previous = range->end + 1;
2634 }
2635 AddCharacterRangeToCharSet(charSet, previous, charSet->length);
2636 }
2637
2638 /* Compile the source of the class into a RECharSet */
2639 static JSBool
2640 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2641 {
2642 const jschar *src, *end;
2643 JSBool inRange = JS_FALSE;
2644 jschar rangeStart = 0;
2645 uintN byteLength, n;
2646 jschar c, thisCh;
2647 intN nDigits, i;
2648
2649 JS_ASSERT(!charSet->converted);
2650 /*
2651 * Assert that startIndex and length points to chars inside [] inside
2652 * source string.
2653 */
2654 JS_ASSERT(1 <= charSet->u.src.startIndex);
2655 JS_ASSERT(charSet->u.src.startIndex
2656 < JSSTRING_LENGTH(gData->regexp->source));
2657 JS_ASSERT(charSet->u.src.length <= JSSTRING_LENGTH(gData->regexp->source)
2658 - 1 - charSet->u.src.startIndex);
2659
2660 charSet->converted = JS_TRUE;
2661 src = JSSTRING_CHARS(gData->regexp->source) + charSet->u.src.startIndex;
2662 end = src + charSet->u.src.length;
2663 JS_ASSERT(src[-1] == '[');
2664 JS_ASSERT(end[0] == ']');
2665
2666 byteLength = (charSet->length >> 3) + 1;
2667 charSet->u.bits = (uint8 *)JS_malloc(gData->cx, byteLength);
2668 if (!charSet->u.bits) {
2669 JS_ReportOutOfMemory(gData->cx);
2670 gData->ok = JS_FALSE;
2671 return JS_FALSE;
2672 }
2673 memset(charSet->u.bits, 0, byteLength);
2674
2675 if (src == end)
2676 return JS_TRUE;
2677
2678 if (*src == '^') {
2679 JS_ASSERT(charSet->sense == JS_FALSE);
2680 ++src;
2681 } else {
2682 JS_ASSERT(charSet->sense == JS_TRUE);
2683 }
2684
2685 while (src != end) {
2686 switch (*src) {
2687 case '\\':
2688 ++src;
2689 c = *src++;
2690 switch (c) {
2691 case 'b':
2692 thisCh = 0x8;
2693 break;
2694 case 'f':
2695 thisCh = 0xC;
2696 break;
2697 case 'n':
2698 thisCh = 0xA;
2699 break;
2700 case 'r':
2701 thisCh = 0xD;
2702 break;
2703 case 't':
2704 thisCh = 0x9;
2705 break;
2706 case 'v':
2707 thisCh = 0xB;
2708 break;
2709 case 'c':
2710 if (src < end && JS_ISWORD(*src)) {
2711 thisCh = (jschar)(*src++ & 0x1F);
2712 } else {
2713 --src;
2714 thisCh = '\\';
2715 }
2716 break;
2717 case 'x':
2718 nDigits = 2;
2719 goto lexHex;
2720 case 'u':
2721 nDigits = 4;
2722 lexHex:
2723 n = 0;
2724 for (i = 0; (i < nDigits) && (src < end); i++) {
2725 uintN digit;
2726 c = *src++;
2727 if (!isASCIIHexDigit(c, &digit)) {
2728 /*
2729 * Back off to accepting the original '\'
2730 * as a literal
2731 */
2732 src -= i + 1;
2733 n = '\\';
2734 break;
2735 }
2736 n = (n << 4) | digit;
2737 }
2738 thisCh = (jschar)n;
2739 break;
2740 case '0':
2741 case '1':
2742 case '2':
2743 case '3':
2744 case '4':
2745 case '5':
2746 case '6':
2747 case '7':
2748 /*
2749 * This is a non-ECMA extension - decimal escapes (in this
2750 * case, octal!) are supposed to be an error inside class
2751 * ranges, but supported here for backwards compatibility.
2752 */
2753 n = JS7_UNDEC(c);
2754 c = *src;
2755 if ('0' <= c && c <= '7') {
2756 src++;
2757 n = 8 * n + JS7_UNDEC(c);
2758 c = *src;
2759 if ('0' <= c && c <= '7') {
2760 src++;
2761 i = 8 * n + JS7_UNDEC(c);
2762 if (i <= 0377)
2763 n = i;
2764 else
2765 src--;
2766 }
2767 }
2768 thisCh = (jschar)n;
2769 break;
2770
2771 case 'd':
2772 AddCharacterRangeToCharSet(charSet, '0', '9');
2773 continue; /* don't need range processing */
2774 case 'D':
2775 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2776 AddCharacterRangeToCharSet(charSet,
2777 (jschar)('9' + 1),
2778 (jschar)charSet->length);
2779 continue;
2780 case 's':
2781 AddCharacterRanges(charSet, WhiteSpaceRanges,
2782 WhiteSpaceRanges + JS_ARRAY_LENGTH(WhiteSpaceRanges));
2783 continue;
2784 case 'S':
2785 AddInvertedCharacterRanges(charSet, WhiteSpaceRanges,
2786 WhiteSpaceRanges + JS_ARRAY_LENGTH(WhiteSpaceRanges));
2787 continue;
2788 case 'w':
2789 AddCharacterRanges(charSet, WordRanges,
2790 WordRanges + JS_ARRAY_LENGTH(WordRanges));
2791 continue;
2792 case 'W':
2793 AddInvertedCharacterRanges(charSet, WordRanges,
2794 WordRanges + JS_ARRAY_LENGTH(WordRanges));
2795 continue;
2796 default:
2797 thisCh = c;
2798 break;
2799
2800 }
2801 break;
2802
2803 default:
2804 thisCh = *src++;
2805 break;
2806
2807 }
2808 if (inRange) {
2809 if (gData->regexp->flags & JSREG_FOLD) {
2810 int i;
2811
2812 JS_ASSERT(rangeStart <= thisCh);
2813 for (i = rangeStart; i <= thisCh; i++) {
2814 jschar uch, dch;
2815
2816 AddCharacterToCharSet(charSet, i);
2817 uch = upcase(i);
2818 dch = downcase(i);
2819 if (i != uch)
2820 AddCharacterToCharSet(charSet, uch);
2821 if (i != dch)
2822 AddCharacterToCharSet(charSet, dch);
2823 }
2824 } else {
2825 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2826 }
2827 inRange = JS_FALSE;
2828 } else {
2829 if (gData->regexp->flags & JSREG_FOLD) {
2830 AddCharacterToCharSet(charSet, upcase(thisCh));
2831 AddCharacterToCharSet(charSet, downcase(thisCh));
2832 } else {
2833 AddCharacterToCharSet(charSet, thisCh);
2834 }
2835 if (src < end - 1) {
2836 if (*src == '-') {
2837 ++src;
2838 inRange = JS_TRUE;
2839 rangeStart = thisCh;
2840 }
2841 }
2842 }
2843 }
2844 return JS_TRUE;
2845 }
2846
2847 void
2848 js_DestroyRegExp(JSContext *cx, JSRegExp *re)
2849 {
2850 if (JS_ATOMIC_DECREMENT(&re->nrefs) == 0) {
2851 #ifdef JS_TRACER
2852 /* Don't reuse this compiled code for some new regexp at same addr. */
2853 Fragment* fragment = JS_TRACE_MONITOR(cx).reFragmento->getLoop(re);
2854 if (fragment)
2855 fragment->blacklist();
2856 #endif
2857 if (re->classList) {
2858 uintN i;
2859 for (i = 0; i < re->classCount; i++) {
2860 if (re->classList[i].converted)
2861 JS_free(cx, re->classList[i].u.bits);
2862 re->classList[i].u.bits = NULL;
2863 }
2864 JS_free(cx, re->classList);
2865 }
2866 JS_free(cx, re);
2867 }
2868 }
2869
2870 static JSBool
2871 ReallocStateStack(REGlobalData *gData)
2872 {
2873 size_t limit = gData->stateStackLimit;
2874 size_t sz = sizeof(REProgState) * limit;
2875
2876 JS_ARENA_GROW_CAST(gData->stateStack, REProgState *,
2877 &gData->cx->regexpPool, sz, sz);
2878 if (!gData->stateStack) {
2879 js_ReportOutOfScriptQuota(gData->cx);
2880 gData->ok = JS_FALSE;
2881 return JS_FALSE;
2882 }
2883 gData->stateStackLimit = limit + limit;
2884 return JS_TRUE;
2885 }
2886
2887 #define PUSH_STATE_STACK(data) \
2888 JS_BEGIN_MACRO \
2889 ++(data)->stateStackTop; \
2890 if ((data)->stateStackTop == (data)->stateStackLimit && \
2891 !ReallocStateStack((data))) { \
2892 return NULL; \
2893 } \
2894 JS_END_MACRO
2895
2896 /*
2897 * Apply the current op against the given input to see if it's going to match
2898 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2899 * true, then update the current state's cp. Always update startpc to the next
2900 * op.
2901 */
2902 static JS_ALWAYS_INLINE REMatchState *
2903 SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
2904 jsbytecode **startpc, JSBool updatecp)
2905 {
2906 REMatchState *result = NULL;
2907 jschar matchCh;
2908 size_t parenIndex;
2909 size_t offset, length, index;
2910 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2911 jschar *source;
2912 const jschar *startcp = x->cp;
2913 jschar ch;
2914 RECharSet *charSet;
2915
2916 #ifdef REGEXP_DEBUG
2917 const char *opname = reop_names[op];
2918 re_debug("\n%06d: %*s%s", pc - gData->regexp->program,
2919 gData->stateStackTop * 2, "", opname);
2920 #endif
2921 switch (op) {
2922 case REOP_EMPTY:
2923 result = x;
2924 break;
2925 case REOP_BOL:
2926 if (x->cp != gData->cpbegin) {
2927 if (!gData->cx->regExpStatics.multiline &&
2928 !(gData->regexp->flags & JSREG_MULTILINE)) {
2929 break;
2930 }
2931 if (!RE_IS_LINE_TERM(x->cp[-1]))
2932 break;
2933 }
2934 result = x;
2935 break;
2936 case REOP_EOL:
2937 if (x->cp != gData->cpend) {
2938 if (!gData->cx->regExpStatics.multiline &&
2939 !(gData->regexp->flags & JSREG_MULTILINE)) {
2940 break;
2941 }
2942 if (!RE_IS_LINE_TERM(*x->cp))
2943 break;
2944 }
2945 result = x;
2946 break;
2947 case REOP_WBDRY:
2948 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2949 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2950 result = x;
2951 }
2952 break;
2953 case REOP_WNONBDRY:
2954 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2955 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2956 result = x;
2957 }
2958 break;
2959 case REOP_DOT:
2960 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2961 result = x;
2962 result->cp++;
2963 }
2964 break;
2965 case REOP_DIGIT:
2966 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2967 result = x;
2968 result->cp++;
2969 }
2970 break;
2971 case REOP_NONDIGIT:
2972 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2973 result = x;
2974 result->cp++;
2975 }
2976 break;
2977 case REOP_ALNUM:
2978 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2979 result = x;
2980 result->cp++;
2981 }
2982 break;
2983 case REOP_NONALNUM:
2984 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2985 result = x;
2986 result->cp++;
2987 }
2988 break;
2989 case REOP_SPACE:
2990 if (x->cp != gData->cpend && JS_ISSPACE(*x->cp)) {
2991 result = x;
2992 result->cp++;
2993 }
2994 break;
2995 case REOP_NONSPACE:
2996 if (x->cp != gData->cpend && !JS_ISSPACE(*x->cp)) {
2997 result = x;
2998 result->cp++;
2999 }
3000 break;
3001 case REOP_BACKREF:
3002 pc = ReadCompactIndex(pc, &parenIndex);
3003 JS_ASSERT(parenIndex < gData->regexp->parenCount);
3004 result = BackrefMatcher(gData, x, parenIndex);
3005 break;
3006 case REOP_FLAT:
3007 pc = ReadCompactIndex(pc, &offset);
3008 JS_ASSERT(offset < JSSTRING_LENGTH(gData->regexp->source));
3009 pc = ReadCompactIndex(pc, &length);
3010 JS_ASSERT(1 <= length);
3011 JS_ASSERT(length <= JSSTRING_LENGTH(gData->regexp->source) - offset);
3012 if (length <= (size_t)(gData->cpend - x->cp)) {
3013 source = JSSTRING_CHARS(gData->regexp->source) + offset;
3014 re_debug_chars(source, length);
3015 for (index = 0; index != length; index++) {
3016 if (source[index] != x->cp[index])
3017 return NULL;
3018 }
3019 x->cp += length;
3020 result = x;
3021 }
3022 break;
3023 case REOP_FLAT1:
3024 matchCh = *pc++;
3025 re_debug(" '%c' == '%c'", (char)matchCh, (char)*x->cp);
3026 if (x->cp != gData->cpend && *x->cp == matchCh) {
3027 result = x;
3028 result->cp++;
3029 }
3030 break;
3031 case REOP_FLATi:
3032 pc = ReadCompactIndex(pc, &offset);
3033 JS_ASSERT(offset < JSSTRING_LENGTH(gData->regexp->source));
3034 pc = ReadCompactIndex(pc, &length);
3035 JS_ASSERT(1 <= length);
3036 JS_ASSERT(length <= JSSTRING_LENGTH(gData->regexp->source) - offset);
3037 source = JSSTRING_CHARS(gData->regexp->source);
3038 result = FlatNIMatcher(gData, x, source + offset, length);
3039 break;
3040 case REOP_FLAT1i:
3041 matchCh = *pc++;
3042 if (x->cp != gData->cpend && upcase(*x->cp) == upcase(matchCh)) {
3043 result = x;
3044 result->cp++;
3045 }
3046 break;
3047 case REOP_UCFLAT1:
3048 matchCh = GET_ARG(pc);
3049 re_debug(" '%c' == '%c'", (char)matchCh, (char)*x->cp);
3050 pc += ARG_LEN;
3051 if (x->cp != gData->cpend && *x->cp == matchCh) {
3052 result = x;
3053 result->cp++;
3054 }
3055 break;
3056 case REOP_UCFLAT1i:
3057 matchCh = GET_ARG(pc);
3058 pc += ARG_LEN;
3059 if (x->cp != gData->cpend && upcase(*x->cp) == upcase(matchCh)) {
3060 result = x;
3061 result->cp++;
3062 }
3063 break;
3064 case REOP_CLASS:
3065 pc = ReadCompactIndex(pc, &index);
3066 JS_ASSERT(index < gData->regexp->classCount);
3067 if (x->cp != gData->cpend) {
3068 charSet = &gData->regexp->classList[index];
3069 JS_ASSERT(charSet->converted);
3070 ch = *x->cp;
3071 index = ch >> 3;
3072 if (charSet->length != 0 &&
3073 ch <= charSet->length &&
3074 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
3075 result = x;
3076 result->cp++;
3077 }
3078 }
3079 break;
3080 case REOP_NCLASS:
3081 pc = ReadCompactIndex(pc, &index);
3082 JS_ASSERT(index < gData->regexp->classCount);
3083 if (x->cp != gData->cpend) {
3084 charSet = &gData->regexp->classList[index];
3085 JS_ASSERT(charSet->converted);
3086 ch = *x->cp;
3087 index = ch >> 3;
3088 if (charSet->length == 0 ||
3089 ch > charSet->length ||
3090 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
3091 result = x;
3092 result->cp++;
3093 }
3094 }
3095 break;
3096
3097 default:
3098 JS_ASSERT(JS_FALSE);
3099 }
3100 if (result) {
3101 if (!updatecp)
3102 x->cp = startcp;
3103 *startpc = pc;
3104 re_debug(" * ");
3105 return result;
3106 }
3107 x->cp = startcp;
3108 return NULL;
3109 }
3110
3111 static JS_ALWAYS_INLINE REMatchState *
3112 ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
3113 {
3114 REMatchState *result = NULL;
3115 REBackTrackData *backTrackData;
3116 jsbytecode *nextpc, *testpc;
3117 REOp nextop;
3118 RECapture *cap;
3119 REProgState *curState;
3120 const jschar *startcp;
3121 size_t parenIndex, k;
3122 size_t parenSoFar = 0;
3123
3124 jschar matchCh1, matchCh2;
3125 RECharSet *charSet;
3126
3127 JSBool anchor;
3128 jsbytecode *pc = gData->regexp->program;
3129 REOp op = (REOp) *pc++;
3130
3131 /*
3132 * If the first node is a simple match, step the index into the string
3133 * until that match is made, or fail if it can't be found at all.
3134 */
3135 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & JSREG_STICKY)) {
3136 anchor = JS_FALSE;
3137 while (x->cp <= gData->cpend) {
3138 nextpc = pc; /* reset back to start each time */
3139 result = SimpleMatch(gData, x, op, &nextpc, JS_TRUE);
3140 if (result) {
3141 anchor = JS_TRUE;
3142 x = result;
3143 pc = nextpc; /* accept skip to next opcode */
3144 op = (REOp) *pc++;
3145 JS_ASSERT(op < REOP_LIMIT);
3146 break;
3147 }
3148 gData->skipped++;
3149 x->cp++;
3150 }
3151 if (!anchor)
3152 goto bad;
3153 }
3154
3155 for (;;) {
3156 #ifdef REGEXP_DEBUG
3157 const char *opname = reop_names[op];
3158 re_debug("\n%06d: %*s%s", pc - gData->regexp->program,
3159 gData->stateStackTop * 2, "", opname);
3160 #endif
3161 if (REOP_IS_SIMPLE(op)) {
3162 result = SimpleMatch(gData, x, op, &pc, JS_TRUE);
3163 } else {
3164 curState = &gData->stateStack[gData->stateStackTop];
3165 switch (op) {
3166 case REOP_END:
3167 goto good;
3168 case REOP_ALTPREREQ2:
3169 nextpc = pc + GET_OFFSET(pc); /* start of next op */
3170 pc += ARG_LEN;
3171 matchCh2 = GET_ARG(pc);
3172 pc += ARG_LEN;
3173 k = GET_ARG(pc);
3174 pc += ARG_LEN;
3175
3176 if (x->cp != gData->cpend) {
3177 if (*x->cp == matchCh2)
3178 goto doAlt;
3179
3180 charSet = &gData->regexp->classList[k];
3181 if (!charSet->converted && !ProcessCharSet(gData, charSet))
3182 goto bad;
3183 matchCh1 = *x->cp;
3184 k = matchCh1 >> 3;
3185 if ((charSet->length == 0 ||
3186 matchCh1 > charSet->length ||
3187 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
3188 charSet->sense) {
3189 goto doAlt;
3190 }
3191 }
3192 result = NULL;
3193 break;
3194
3195 case REOP_ALTPREREQ:
3196 nextpc = pc + GET_OFFSET(pc); /* start of next op */
3197 pc += ARG_LEN;
3198 matchCh1 = GET_ARG(pc);
3199 pc += ARG_LEN;
3200 matchCh2 = GET_ARG(pc);
3201 pc += ARG_LEN;
3202 if (x->cp == gData->cpend ||
3203 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
3204 result = NULL;
3205 break;
3206 }
3207 /* else false thru... */
3208
3209 case REOP_ALT:
3210 doAlt:
3211 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
3212 pc += ARG_LEN; /* start of this alternate */
3213 curState->parenSoFar