/[jscoverage]/trunk/js/jsregexp.cpp
ViewVC logotype

Contents of /trunk/js/jsregexp.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 460 - (show annotations)
Sat Sep 26 23:15:22 2009 UTC (9 years, 11 months ago) by siliconforks
File size: 169092 byte(s)
Upgrade to SpiderMonkey from Firefox 3.5.3.

1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 * vim: set sw=4 ts=8 et tw=78:
3 *
4 * ***** BEGIN LICENSE BLOCK *****
5 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 *
7 * The contents of this file are subject to the Mozilla Public License Version
8 * 1.1 (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 * http://www.mozilla.org/MPL/
11 *
12 * Software distributed under the License is distributed on an "AS IS" basis,
13 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
14 * for the specific language governing rights and limitations under the
15 * License.
16 *
17 * The Original Code is Mozilla Communicator client code, released
18 * March 31, 1998.
19 *
20 * The Initial Developer of the Original Code is
21 * Netscape Communications Corporation.
22 * Portions created by the Initial Developer are Copyright (C) 1998
23 * the Initial Developer. All Rights Reserved.
24 *
25 * Contributor(s):
26 *
27 * Alternatively, the contents of this file may be used under the terms of
28 * either of the GNU General Public License Version 2 or later (the "GPL"),
29 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 * in which case the provisions of the GPL or the LGPL are applicable instead
31 * of those above. If you wish to allow use of your version of this file only
32 * under the terms of either the GPL or the LGPL, and not to allow others to
33 * use your version of this file under the terms of the MPL, indicate your
34 * decision by deleting the provisions above and replace them with the notice
35 * and other provisions required by the GPL or the LGPL. If you do not delete
36 * the provisions above, a recipient may use your version of this file under
37 * the terms of any one of the MPL, the GPL or the LGPL.
38 *
39 * ***** END LICENSE BLOCK ***** */
40
41 /*
42 * JS regular expressions, after Perl.
43 */
44 #include "jsstddef.h"
45 #include <stdlib.h>
46 #include <string.h>
47 #include <stdarg.h>
48 #include "jstypes.h"
49 #include "jsarena.h" /* Added by JSIFY */
50 #include "jsutil.h" /* Added by JSIFY */
51 #include "jsapi.h"
52 #include "jsarray.h"
53 #include "jsatom.h"
54 #include "jsbuiltins.h"
55 #include "jscntxt.h"
56 #include "jsversion.h"
57 #include "jsfun.h"
58 #include "jsgc.h"
59 #include "jsinterp.h"
60 #include "jslock.h"
61 #include "jsnum.h"
62 #include "jsobj.h"
63 #include "jsopcode.h"
64 #include "jsregexp.h"
65 #include "jsscan.h"
66 #include "jsscope.h"
67 #include "jsstaticcheck.h"
68 #include "jsstr.h"
69
70 #ifdef JS_TRACER
71 #include "jstracer.h"
72 using namespace avmplus;
73 using namespace nanojit;
74 #endif
75
76 typedef enum REOp {
77 #define REOP_DEF(opcode, name) opcode,
78 #include "jsreops.tbl"
79 #undef REOP_DEF
80 REOP_LIMIT /* META: no operator >= to this */
81 } REOp;
82
83 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
84
85 #ifdef REGEXP_DEBUG
86 const char *reop_names[] = {
87 #define REOP_DEF(opcode, name) name,
88 #include "jsreops.tbl"
89 #undef REOP_DEF
90 NULL
91 };
92 #endif
93
94 #ifdef __GNUC__
95 static int
96 re_debug(const char *fmt, ...) __attribute__ ((format(printf, 1, 2)));
97 #endif
98
99 #ifdef REGEXP_DEBUG
100 static int
101 re_debug(const char *fmt, ...)
102 {
103 va_list ap;
104 int retval;
105
106 va_start(ap, fmt);
107 retval = vprintf(fmt, ap);
108 va_end(ap);
109 return retval;
110 }
111
112 static void
113 re_debug_chars(const jschar *chrs, size_t length)
114 {
115 int i = 0;
116
117 printf(" \"");
118 while (*chrs && i++ < length) {
119 putchar((char)*chrs++);
120 }
121 printf("\"");
122 }
123 #else /* !REGEXP_DEBUG */
124 /* This should be optimized to a no-op by our tier-1 compilers. */
125 static int
126 re_debug(const char *fmt, ...)
127 {
128 return 0;
129 }
130
131 static void
132 re_debug_chars(const jschar *chrs, size_t length)
133 {
134 }
135 #endif /* !REGEXP_DEBUG */
136
137 struct RENode {
138 REOp op; /* r.e. op bytecode */
139 RENode *next; /* next in concatenation order */
140 void *kid; /* first operand */
141 union {
142 void *kid2; /* second operand */
143 jsint num; /* could be a number */
144 size_t parenIndex; /* or a parenthesis index */
145 struct { /* or a quantifier range */
146 uintN min;
147 uintN max;
148 JSPackedBool greedy;
149 } range;
150 struct { /* or a character class */
151 size_t startIndex;
152 size_t kidlen; /* length of string at kid, in jschars */
153 size_t index; /* index into class list */
154 uint16 bmsize; /* bitmap size, based on max char code */
155 JSPackedBool sense;
156 } ucclass;
157 struct { /* or a literal sequence */
158 jschar chr; /* of one character */
159 size_t length; /* or many (via the kid) */
160 } flat;
161 struct {
162 RENode *kid2; /* second operand from ALT */
163 jschar ch1; /* match char for ALTPREREQ */
164 jschar ch2; /* ditto, or class index for ALTPREREQ2 */
165 } altprereq;
166 } u;
167 };
168
169 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
170 ((c >= 'a') && (c <= 'z')) )
171 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
172 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
173
174 #define CLASS_CACHE_SIZE 4
175
176 typedef struct CompilerState {
177 JSContext *context;
178 JSTokenStream *tokenStream; /* For reporting errors */
179 const jschar *cpbegin;
180 const jschar *cpend;
181 const jschar *cp;
182 size_t parenCount;
183 size_t classCount; /* number of [] encountered */
184 size_t treeDepth; /* maximum depth of parse tree */
185 size_t progLength; /* estimated bytecode length */
186 RENode *result;
187 size_t classBitmapsMem; /* memory to hold all class bitmaps */
188 struct {
189 const jschar *start; /* small cache of class strings */
190 size_t length; /* since they're often the same */
191 size_t index;
192 } classCache[CLASS_CACHE_SIZE];
193 uint16 flags;
194 } CompilerState;
195
196 typedef struct EmitStateStackEntry {
197 jsbytecode *altHead; /* start of REOP_ALT* opcode */
198 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
199 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
200 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
201 RENode *continueNode; /* original REOP_ALT* node being stacked */
202 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
203 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
204 avoid 16-bit unsigned offset overflow */
205 } EmitStateStackEntry;
206
207 /*
208 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
209 * the getters and setters take the pc of the offset, not of the opcode before
210 * the offset.
211 */
212 #define ARG_LEN 2
213 #define GET_ARG(pc) ((uint16)(((pc)[0] << 8) | (pc)[1]))
214 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
215 (pc)[1] = (jsbytecode) (arg))
216
217 #define OFFSET_LEN ARG_LEN
218 #define OFFSET_MAX (JS_BIT(ARG_LEN * 8) - 1)
219 #define GET_OFFSET(pc) GET_ARG(pc)
220
221 /*
222 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
223 * For sanity, we limit it to 2^24 bytes.
224 */
225 #define TREE_DEPTH_MAX (JS_BIT(24) / sizeof(EmitStateStackEntry))
226
227 /*
228 * The maximum memory that can be allocated for class bitmaps.
229 * For sanity, we limit it to 2^24 bytes.
230 */
231 #define CLASS_BITMAPS_MEM_LIMIT JS_BIT(24)
232
233 /*
234 * Functions to get size and write/read bytecode that represent small indexes
235 * compactly.
236 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
237 * indicates that the following byte brings more bits to the index. Otherwise
238 * this is the last byte in the index bytecode representing highest index bits.
239 */
240 static size_t
241 GetCompactIndexWidth(size_t index)
242 {
243 size_t width;
244
245 for (width = 1; (index >>= 7) != 0; ++width) { }
246 return width;
247 }
248
249 static JS_ALWAYS_INLINE jsbytecode *
250 WriteCompactIndex(jsbytecode *pc, size_t index)
251 {
252 size_t next;
253
254 while ((next = index >> 7) != 0) {
255 *pc++ = (jsbytecode)(index | 0x80);
256 index = next;
257 }
258 *pc++ = (jsbytecode)index;
259 return pc;
260 }
261
262 static JS_ALWAYS_INLINE jsbytecode *
263 ReadCompactIndex(jsbytecode *pc, size_t *result)
264 {
265 size_t nextByte;
266
267 nextByte = *pc++;
268 if ((nextByte & 0x80) == 0) {
269 /*
270 * Short-circuit the most common case when compact index <= 127.
271 */
272 *result = nextByte;
273 } else {
274 size_t shift = 7;
275 *result = 0x7F & nextByte;
276 do {
277 nextByte = *pc++;
278 *result |= (nextByte & 0x7F) << shift;
279 shift += 7;
280 } while ((nextByte & 0x80) != 0);
281 }
282 return pc;
283 }
284
285 typedef struct RECapture {
286 ptrdiff_t index; /* start of contents, -1 for empty */
287 size_t length; /* length of capture */
288 } RECapture;
289
290 typedef struct REMatchState {
291 const jschar *cp;
292 RECapture parens[1]; /* first of 're->parenCount' captures,
293 allocated at end of this struct */
294 } REMatchState;
295
296 struct REBackTrackData;
297
298 typedef struct REProgState {
299 jsbytecode *continue_pc; /* current continuation data */
300 jsbytecode continue_op;
301 ptrdiff_t index; /* progress in text */
302 size_t parenSoFar; /* highest indexed paren started */
303 union {
304 struct {
305 uintN min; /* current quantifier limits */
306 uintN max;
307 } quantifier;
308 struct {
309 size_t top; /* backtrack stack state */
310 size_t sz;
311 } assertion;
312 } u;
313 } REProgState;
314
315 typedef struct REBackTrackData {
316 size_t sz; /* size of previous stack entry */
317 jsbytecode *backtrack_pc; /* where to backtrack to */
318 jsbytecode backtrack_op;
319 const jschar *cp; /* index in text of match at backtrack */
320 size_t parenIndex; /* start index of saved paren contents */
321 size_t parenCount; /* # of saved paren contents */
322 size_t saveStateStackTop; /* number of parent states */
323 /* saved parent states follow */
324 /* saved paren contents follow */
325 } REBackTrackData;
326
327 #define INITIAL_STATESTACK 100
328 #define INITIAL_BACKTRACK 8000
329
330 typedef struct REGlobalData {
331 JSContext *cx;
332 JSRegExp *regexp; /* the RE in execution */
333 JSBool ok; /* runtime error (out_of_memory only?) */
334 size_t start; /* offset to start at */
335 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
336 const jschar *cpbegin; /* text base address */
337 const jschar *cpend; /* text limit address */
338
339 REProgState *stateStack; /* stack of state of current parents */
340 size_t stateStackTop;
341 size_t stateStackLimit;
342
343 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
344 REBackTrackData *backTrackSP;
345 size_t backTrackStackSize;
346 size_t cursz; /* size of current stack entry */
347 size_t backTrackCount; /* how many times we've backtracked */
348 size_t backTrackLimit; /* upper limit on backtrack states */
349 } REGlobalData;
350
351 /*
352 * 1. If IgnoreCase is false, return ch.
353 * 2. Let u be ch converted to upper case as if by calling
354 * String.prototype.toUpperCase on the one-character string ch.
355 * 3. If u does not consist of a single character, return ch.
356 * 4. Let cu be u's character.
357 * 5. If ch's code point value is greater than or equal to decimal 128 and cu's
358 * code point value is less than decimal 128, then return ch.
359 * 6. Return cu.
360 */
361 static JS_ALWAYS_INLINE uintN
362 upcase(uintN ch)
363 {
364 uintN cu;
365
366 JS_ASSERT((uintN) (jschar) ch == ch);
367 if (ch < 128) {
368 if (ch - (uintN) 'a' <= (uintN) ('z' - 'a'))
369 ch -= (uintN) ('a' - 'A');
370 return ch;
371 }
372
373 cu = JS_TOUPPER(ch);
374 return (cu < 128) ? ch : cu;
375 }
376
377 static JS_ALWAYS_INLINE uintN
378 downcase(uintN ch)
379 {
380 JS_ASSERT((uintN) (jschar) ch == ch);
381 if (ch < 128) {
382 if (ch - (uintN) 'A' <= (uintN) ('Z' - 'A'))
383 ch += (uintN) ('a' - 'A');
384 return ch;
385 }
386
387 return JS_TOLOWER(ch);
388 }
389
390 /* Construct and initialize an RENode, returning NULL for out-of-memory */
391 static RENode *
392 NewRENode(CompilerState *state, REOp op)
393 {
394 JSContext *cx;
395 RENode *ren;
396
397 cx = state->context;
398 JS_ARENA_ALLOCATE_CAST(ren, RENode *, &cx->tempPool, sizeof *ren);
399 if (!ren) {
400 js_ReportOutOfScriptQuota(cx);
401 return NULL;
402 }
403 ren->op = op;
404 ren->next = NULL;
405 ren->kid = NULL;
406 return ren;
407 }
408
409 /*
410 * Validates and converts hex ascii value.
411 */
412 static JSBool
413 isASCIIHexDigit(jschar c, uintN *digit)
414 {
415 uintN cv = c;
416
417 if (cv < '0')
418 return JS_FALSE;
419 if (cv <= '9') {
420 *digit = cv - '0';
421 return JS_TRUE;
422 }
423 cv |= 0x20;
424 if (cv >= 'a' && cv <= 'f') {
425 *digit = cv - 'a' + 10;
426 return JS_TRUE;
427 }
428 return JS_FALSE;
429 }
430
431
432 typedef struct {
433 REOp op;
434 const jschar *errPos;
435 size_t parenIndex;
436 } REOpData;
437
438 static JSBool
439 ReportRegExpErrorHelper(CompilerState *state, uintN flags, uintN errorNumber,
440 const jschar *arg)
441 {
442 if (state->tokenStream) {
443 return js_ReportCompileErrorNumber(state->context, state->tokenStream,
444 NULL, JSREPORT_UC | flags,
445 errorNumber, arg);
446 }
447 return JS_ReportErrorFlagsAndNumberUC(state->context, flags,
448 js_GetErrorMessage, NULL,
449 errorNumber, arg);
450 }
451
452 static JSBool
453 ReportRegExpError(CompilerState *state, uintN flags, uintN errorNumber)
454 {
455 return ReportRegExpErrorHelper(state, flags, errorNumber, NULL);
456 }
457
458 /*
459 * Process the op against the two top operands, reducing them to a single
460 * operand in the penultimate slot. Update progLength and treeDepth.
461 */
462 static JSBool
463 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
464 intN operandSP)
465 {
466 RENode *result;
467
468 switch (opData->op) {
469 case REOP_ALT:
470 result = NewRENode(state, REOP_ALT);
471 if (!result)
472 return JS_FALSE;
473 result->kid = operandStack[operandSP - 2];
474 result->u.kid2 = operandStack[operandSP - 1];
475 operandStack[operandSP - 2] = result;
476
477 if (state->treeDepth == TREE_DEPTH_MAX) {
478 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
479 return JS_FALSE;
480 }
481 ++state->treeDepth;
482
483 /*
484 * Look at both alternates to see if there's a FLAT or a CLASS at
485 * the start of each. If so, use a prerequisite match.
486 */
487 if (((RENode *) result->kid)->op == REOP_FLAT &&
488 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
489 (state->flags & JSREG_FOLD) == 0) {
490 result->op = REOP_ALTPREREQ;
491 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
492 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
493 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
494 JUMP, <end> ... ENDALT */
495 state->progLength += 13;
496 }
497 else
498 if (((RENode *) result->kid)->op == REOP_CLASS &&
499 ((RENode *) result->kid)->u.ucclass.index < 256 &&
500 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
501 (state->flags & JSREG_FOLD) == 0) {
502 result->op = REOP_ALTPREREQ2;
503 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
504 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
505 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
506 JUMP, <end> ... ENDALT */
507 state->progLength += 13;
508 }
509 else
510 if (((RENode *) result->kid)->op == REOP_FLAT &&
511 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
512 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
513 (state->flags & JSREG_FOLD) == 0) {
514 result->op = REOP_ALTPREREQ2;
515 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
516 result->u.altprereq.ch2 =
517 ((RENode *) result->u.kid2)->u.ucclass.index;
518 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
519 JUMP, <end> ... ENDALT */
520 state->progLength += 13;
521 }
522 else {
523 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
524 state->progLength += 7;
525 }
526 break;
527
528 case REOP_CONCAT:
529 result = operandStack[operandSP - 2];
530 while (result->next)
531 result = result->next;
532 result->next = operandStack[operandSP - 1];
533 break;
534
535 case REOP_ASSERT:
536 case REOP_ASSERT_NOT:
537 case REOP_LPARENNON:
538 case REOP_LPAREN:
539 /* These should have been processed by a close paren. */
540 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
541 opData->errPos);
542 return JS_FALSE;
543
544 default:;
545 }
546 return JS_TRUE;
547 }
548
549 /*
550 * Parser forward declarations.
551 */
552 static JSBool ParseTerm(CompilerState *state);
553 static JSBool ParseQuantifier(CompilerState *state);
554 static intN ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues);
555
556 /*
557 * Top-down regular expression grammar, based closely on Perl4.
558 *
559 * regexp: altern A regular expression is one or more
560 * altern '|' regexp alternatives separated by vertical bar.
561 */
562 #define INITIAL_STACK_SIZE 128
563
564 static JSBool
565 ParseRegExp(CompilerState *state)
566 {
567 size_t parenIndex;
568 RENode *operand;
569 REOpData *operatorStack;
570 RENode **operandStack;
571 REOp op;
572 intN i;
573 JSBool result = JS_FALSE;
574
575 intN operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
576 intN operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
577
578 /* Watch out for empty regexp */
579 if (state->cp == state->cpend) {
580 state->result = NewRENode(state, REOP_EMPTY);
581 return (state->result != NULL);
582 }
583
584 operatorStack = (REOpData *)
585 JS_malloc(state->context, sizeof(REOpData) * operatorStackSize);
586 if (!operatorStack)
587 return JS_FALSE;
588
589 operandStack = (RENode **)
590 JS_malloc(state->context, sizeof(RENode *) * operandStackSize);
591 if (!operandStack)
592 goto out;
593
594 for (;;) {
595 parenIndex = state->parenCount;
596 if (state->cp == state->cpend) {
597 /*
598 * If we are at the end of the regexp and we're short one or more
599 * operands, the regexp must have the form /x|/ or some such, with
600 * left parentheses making us short more than one operand.
601 */
602 if (operatorSP >= operandSP) {
603 operand = NewRENode(state, REOP_EMPTY);
604 if (!operand)
605 goto out;
606 goto pushOperand;
607 }
608 } else {
609 switch (*state->cp) {
610 case '(':
611 ++state->cp;
612 if (state->cp + 1 < state->cpend &&
613 *state->cp == '?' &&
614 (state->cp[1] == '=' ||
615 state->cp[1] == '!' ||
616 state->cp[1] == ':')) {
617 switch (state->cp[1]) {
618 case '=':
619 op = REOP_ASSERT;
620 /* ASSERT, <next>, ... ASSERTTEST */
621 state->progLength += 4;
622 break;
623 case '!':
624 op = REOP_ASSERT_NOT;
625 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
626 state->progLength += 4;
627 break;
628 default:
629 op = REOP_LPARENNON;
630 break;
631 }
632 state->cp += 2;
633 } else {
634 op = REOP_LPAREN;
635 /* LPAREN, <index>, ... RPAREN, <index> */
636 state->progLength
637 += 2 * (1 + GetCompactIndexWidth(parenIndex));
638 state->parenCount++;
639 if (state->parenCount == 65535) {
640 ReportRegExpError(state, JSREPORT_ERROR,
641 JSMSG_TOO_MANY_PARENS);
642 goto out;
643 }
644 }
645 goto pushOperator;
646
647 case ')':
648 /*
649 * If there's no stacked open parenthesis, throw syntax error.
650 */
651 for (i = operatorSP - 1; ; i--) {
652 if (i < 0) {
653 ReportRegExpError(state, JSREPORT_ERROR,
654 JSMSG_UNMATCHED_RIGHT_PAREN);
655 goto out;
656 }
657 if (operatorStack[i].op == REOP_ASSERT ||
658 operatorStack[i].op == REOP_ASSERT_NOT ||
659 operatorStack[i].op == REOP_LPARENNON ||
660 operatorStack[i].op == REOP_LPAREN) {
661 break;
662 }
663 }
664 /* FALL THROUGH */
665
666 case '|':
667 /* Expected an operand before these, so make an empty one */
668 operand = NewRENode(state, REOP_EMPTY);
669 if (!operand)
670 goto out;
671 goto pushOperand;
672
673 default:
674 if (!ParseTerm(state))
675 goto out;
676 operand = state->result;
677 pushOperand:
678 if (operandSP == operandStackSize) {
679 RENode **tmp;
680 operandStackSize += operandStackSize;
681 tmp = (RENode **)
682 JS_realloc(state->context, operandStack,
683 sizeof(RENode *) * operandStackSize);
684 if (!tmp)
685 goto out;
686 operandStack = tmp;
687 }
688 operandStack[operandSP++] = operand;
689 break;
690 }
691 }
692
693 /* At the end; process remaining operators. */
694 restartOperator:
695 if (state->cp == state->cpend) {
696 while (operatorSP) {
697 --operatorSP;
698 if (!ProcessOp(state, &operatorStack[operatorSP],
699 operandStack, operandSP))
700 goto out;
701 --operandSP;
702 }
703 JS_ASSERT(operandSP == 1);
704 state->result = operandStack[0];
705 result = JS_TRUE;
706 goto out;
707 }
708
709 switch (*state->cp) {
710 case '|':
711 /* Process any stacked 'concat' operators */
712 ++state->cp;
713 while (operatorSP &&
714 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
715 --operatorSP;
716 if (!ProcessOp(state, &operatorStack[operatorSP],
717 operandStack, operandSP)) {
718 goto out;
719 }
720 --operandSP;
721 }
722 op = REOP_ALT;
723 goto pushOperator;
724
725 case ')':
726 /*
727 * If there's no stacked open parenthesis, throw syntax error.
728 */
729 for (i = operatorSP - 1; ; i--) {
730 if (i < 0) {
731 ReportRegExpError(state, JSREPORT_ERROR,
732 JSMSG_UNMATCHED_RIGHT_PAREN);
733 goto out;
734 }
735 if (operatorStack[i].op == REOP_ASSERT ||
736 operatorStack[i].op == REOP_ASSERT_NOT ||
737 operatorStack[i].op == REOP_LPARENNON ||
738 operatorStack[i].op == REOP_LPAREN) {
739 break;
740 }
741 }
742 ++state->cp;
743
744 /* Process everything on the stack until the open parenthesis. */
745 for (;;) {
746 JS_ASSERT(operatorSP);
747 --operatorSP;
748 switch (operatorStack[operatorSP].op) {
749 case REOP_ASSERT:
750 case REOP_ASSERT_NOT:
751 case REOP_LPAREN:
752 operand = NewRENode(state, operatorStack[operatorSP].op);
753 if (!operand)
754 goto out;
755 operand->u.parenIndex =
756 operatorStack[operatorSP].parenIndex;
757 JS_ASSERT(operandSP);
758 operand->kid = operandStack[operandSP - 1];
759 operandStack[operandSP - 1] = operand;
760 if (state->treeDepth == TREE_DEPTH_MAX) {
761 ReportRegExpError(state, JSREPORT_ERROR,
762 JSMSG_REGEXP_TOO_COMPLEX);
763 goto out;
764 }
765 ++state->treeDepth;
766 /* FALL THROUGH */
767
768 case REOP_LPARENNON:
769 state->result = operandStack[operandSP - 1];
770 if (!ParseQuantifier(state))
771 goto out;
772 operandStack[operandSP - 1] = state->result;
773 goto restartOperator;
774 default:
775 if (!ProcessOp(state, &operatorStack[operatorSP],
776 operandStack, operandSP))
777 goto out;
778 --operandSP;
779 break;
780 }
781 }
782 break;
783
784 case '{':
785 {
786 const jschar *errp = state->cp;
787
788 if (ParseMinMaxQuantifier(state, JS_TRUE) < 0) {
789 /*
790 * This didn't even scan correctly as a quantifier, so we should
791 * treat it as flat.
792 */
793 op = REOP_CONCAT;
794 goto pushOperator;
795 }
796
797 state->cp = errp;
798 /* FALL THROUGH */
799 }
800
801 case '+':
802 case '*':
803 case '?':
804 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
805 state->cp);
806 result = JS_FALSE;
807 goto out;
808
809 default:
810 /* Anything else is the start of the next term. */
811 op = REOP_CONCAT;
812 pushOperator:
813 if (operatorSP == operatorStackSize) {
814 REOpData *tmp;
815 operatorStackSize += operatorStackSize;
816 tmp = (REOpData *)
817 JS_realloc(state->context, operatorStack,
818 sizeof(REOpData) * operatorStackSize);
819 if (!tmp)
820 goto out;
821 operatorStack = tmp;
822 }
823 operatorStack[operatorSP].op = op;
824 operatorStack[operatorSP].errPos = state->cp;
825 operatorStack[operatorSP++].parenIndex = parenIndex;
826 break;
827 }
828 }
829 out:
830 if (operatorStack)
831 JS_free(state->context, operatorStack);
832 if (operandStack)
833 JS_free(state->context, operandStack);
834 return result;
835 }
836
837 /*
838 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
839 * its being on the stack, and to propagate errors to its callers.
840 */
841 #define JSREG_FIND_PAREN_COUNT 0x8000
842 #define JSREG_FIND_PAREN_ERROR 0x4000
843
844 /*
845 * Magic return value from FindParenCount and GetDecimalValue, to indicate
846 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
847 * its findMax parameter is non-null.
848 */
849 #define OVERFLOW_VALUE ((uintN)-1)
850
851 static uintN
852 FindParenCount(CompilerState *state)
853 {
854 CompilerState temp;
855 int i;
856
857 if (state->flags & JSREG_FIND_PAREN_COUNT)
858 return OVERFLOW_VALUE;
859
860 /*
861 * Copy state into temp, flag it so we never report an invalid backref,
862 * and reset its members to parse the entire regexp. This is obviously
863 * suboptimal, but GetDecimalValue calls us only if a backref appears to
864 * refer to a forward parenthetical, which is rare.
865 */
866 temp = *state;
867 temp.flags |= JSREG_FIND_PAREN_COUNT;
868 temp.cp = temp.cpbegin;
869 temp.parenCount = 0;
870 temp.classCount = 0;
871 temp.progLength = 0;
872 temp.treeDepth = 0;
873 temp.classBitmapsMem = 0;
874 for (i = 0; i < CLASS_CACHE_SIZE; i++)
875 temp.classCache[i].start = NULL;
876
877 if (!ParseRegExp(&temp)) {
878 state->flags |= JSREG_FIND_PAREN_ERROR;
879 return OVERFLOW_VALUE;
880 }
881 return temp.parenCount;
882 }
883
884 /*
885 * Extract and return a decimal value at state->cp. The initial character c
886 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
887 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
888 * state->flags to discover whether an error occurred under findMax.
889 */
890 static uintN
891 GetDecimalValue(jschar c, uintN max, uintN (*findMax)(CompilerState *state),
892 CompilerState *state)
893 {
894 uintN value = JS7_UNDEC(c);
895 JSBool overflow = (value > max && (!findMax || value > findMax(state)));
896
897 /* The following restriction allows simpler overflow checks. */
898 JS_ASSERT(max <= ((uintN)-1 - 9) / 10);
899 while (state->cp < state->cpend) {
900 c = *state->cp;
901 if (!JS7_ISDEC(c))
902 break;
903 value = 10 * value + JS7_UNDEC(c);
904 if (!overflow && value > max && (!findMax || value > findMax(state)))
905 overflow = JS_TRUE;
906 ++state->cp;
907 }
908 return overflow ? OVERFLOW_VALUE : value;
909 }
910
911 /*
912 * Calculate the total size of the bitmap required for a class expression.
913 */
914 static JSBool
915 CalculateBitmapSize(CompilerState *state, RENode *target, const jschar *src,
916 const jschar *end)
917 {
918 uintN max = 0;
919 JSBool inRange = JS_FALSE;
920 jschar c, rangeStart = 0;
921 uintN n, digit, nDigits, i;
922
923 target->u.ucclass.bmsize = 0;
924 target->u.ucclass.sense = JS_TRUE;
925
926 if (src == end)
927 return JS_TRUE;
928
929 if (*src == '^') {
930 ++src;
931 target->u.ucclass.sense = JS_FALSE;
932 }
933
934 while (src != end) {
935 JSBool canStartRange = JS_TRUE;
936 uintN localMax = 0;
937
938 switch (*src) {
939 case '\\':
940 ++src;
941 c = *src++;
942 switch (c) {
943 case 'b':
944 localMax = 0x8;
945 break;
946 case 'f':
947 localMax = 0xC;
948 break;
949 case 'n':
950 localMax = 0xA;
951 break;
952 case 'r':
953 localMax = 0xD;
954 break;
955 case 't':
956 localMax = 0x9;
957 break;
958 case 'v':
959 localMax = 0xB;
960 break;
961 case 'c':
962 if (src < end && RE_IS_LETTER(*src)) {
963 localMax = (uintN) (*src++) & 0x1F;
964 } else {
965 --src;
966 localMax = '\\';
967 }
968 break;
969 case 'x':
970 nDigits = 2;
971 goto lexHex;
972 case 'u':
973 nDigits = 4;
974 lexHex:
975 n = 0;
976 for (i = 0; (i < nDigits) && (src < end); i++) {
977 c = *src++;
978 if (!isASCIIHexDigit(c, &digit)) {
979 /*
980 * Back off to accepting the original
981 *'\' as a literal.
982 */
983 src -= i + 1;
984 n = '\\';
985 break;
986 }
987 n = (n << 4) | digit;
988 }
989 localMax = n;
990 break;
991 case 'd':
992 canStartRange = JS_FALSE;
993 if (inRange) {
994 JS_ReportErrorNumber(state->context,
995 js_GetErrorMessage, NULL,
996 JSMSG_BAD_CLASS_RANGE);
997 return JS_FALSE;
998 }
999 localMax = '9';
1000 break;
1001 case 'D':
1002 case 's':
1003 case 'S':
1004 case 'w':
1005 case 'W':
1006 canStartRange = JS_FALSE;
1007 if (inRange) {
1008 JS_ReportErrorNumber(state->context,
1009 js_GetErrorMessage, NULL,
1010 JSMSG_BAD_CLASS_RANGE);
1011 return JS_FALSE;
1012 }
1013 max = 65535;
1014
1015 /*
1016 * If this is the start of a range, ensure that it's less than
1017 * the end.
1018 */
1019 localMax = 0;
1020 break;
1021 case '0':
1022 case '1':
1023 case '2':
1024 case '3':
1025 case '4':
1026 case '5':
1027 case '6':
1028 case '7':
1029 /*
1030 * This is a non-ECMA extension - decimal escapes (in this
1031 * case, octal!) are supposed to be an error inside class
1032 * ranges, but supported here for backwards compatibility.
1033 *
1034 */
1035 n = JS7_UNDEC(c);
1036 c = *src;
1037 if ('0' <= c && c <= '7') {
1038 src++;
1039 n = 8 * n + JS7_UNDEC(c);
1040 c = *src;
1041 if ('0' <= c && c <= '7') {
1042 src++;
1043 i = 8 * n + JS7_UNDEC(c);
1044 if (i <= 0377)
1045 n = i;
1046 else
1047 src--;
1048 }
1049 }
1050 localMax = n;
1051 break;
1052
1053 default:
1054 localMax = c;
1055 break;
1056 }
1057 break;
1058 default:
1059 localMax = *src++;
1060 break;
1061 }
1062
1063 if (inRange) {
1064 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1065 if (rangeStart > localMax) {
1066 JS_ReportErrorNumber(state->context,
1067 js_GetErrorMessage, NULL,
1068 JSMSG_BAD_CLASS_RANGE);
1069 return JS_FALSE;
1070 }
1071 inRange = JS_FALSE;
1072 } else {
1073 if (canStartRange && src < end - 1) {
1074 if (*src == '-') {
1075 ++src;
1076 inRange = JS_TRUE;
1077 rangeStart = (jschar)localMax;
1078 continue;
1079 }
1080 }
1081 if (state->flags & JSREG_FOLD)
1082 rangeStart = localMax; /* one run of the uc/dc loop below */
1083 }
1084
1085 if (state->flags & JSREG_FOLD) {
1086 jschar maxch = localMax;
1087
1088 for (i = rangeStart; i <= localMax; i++) {
1089 jschar uch, dch;
1090
1091 uch = upcase(i);
1092 dch = downcase(i);
1093 maxch = JS_MAX(maxch, uch);
1094 maxch = JS_MAX(maxch, dch);
1095 }
1096 localMax = maxch;
1097 }
1098
1099 if (localMax > max)
1100 max = localMax;
1101 }
1102 target->u.ucclass.bmsize = max;
1103 return JS_TRUE;
1104 }
1105
1106 /*
1107 * item: assertion An item is either an assertion or
1108 * quantatom a quantified atom.
1109 *
1110 * assertion: '^' Assertions match beginning of string
1111 * (or line if the class static property
1112 * RegExp.multiline is true).
1113 * '$' End of string (or line if the class
1114 * static property RegExp.multiline is
1115 * true).
1116 * '\b' Word boundary (between \w and \W).
1117 * '\B' Word non-boundary.
1118 *
1119 * quantatom: atom An unquantified atom.
1120 * quantatom '{' n ',' m '}'
1121 * Atom must occur between n and m times.
1122 * quantatom '{' n ',' '}' Atom must occur at least n times.
1123 * quantatom '{' n '}' Atom must occur exactly n times.
1124 * quantatom '*' Zero or more times (same as {0,}).
1125 * quantatom '+' One or more times (same as {1,}).
1126 * quantatom '?' Zero or one time (same as {0,1}).
1127 *
1128 * any of which can be optionally followed by '?' for ungreedy
1129 *
1130 * atom: '(' regexp ')' A parenthesized regexp (what matched
1131 * can be addressed using a backreference,
1132 * see '\' n below).
1133 * '.' Matches any char except '\n'.
1134 * '[' classlist ']' A character class.
1135 * '[' '^' classlist ']' A negated character class.
1136 * '\f' Form Feed.
1137 * '\n' Newline (Line Feed).
1138 * '\r' Carriage Return.
1139 * '\t' Horizontal Tab.
1140 * '\v' Vertical Tab.
1141 * '\d' A digit (same as [0-9]).
1142 * '\D' A non-digit.
1143 * '\w' A word character, [0-9a-z_A-Z].
1144 * '\W' A non-word character.
1145 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1146 * '\S' A non-whitespace character.
1147 * '\' n A backreference to the nth (n decimal
1148 * and positive) parenthesized expression.
1149 * '\' octal An octal escape sequence (octal must be
1150 * two or three digits long, unless it is
1151 * 0 for the null character).
1152 * '\x' hex A hex escape (hex must be two digits).
1153 * '\u' unicode A unicode escape (must be four digits).
1154 * '\c' ctrl A control character, ctrl is a letter.
1155 * '\' literalatomchar Any character except one of the above
1156 * that follow '\' in an atom.
1157 * otheratomchar Any character not first among the other
1158 * atom right-hand sides.
1159 */
1160 static JSBool
1161 ParseTerm(CompilerState *state)
1162 {
1163 jschar c = *state->cp++;
1164 uintN nDigits;
1165 uintN num, tmp, n, i;
1166 const jschar *termStart;
1167
1168 switch (c) {
1169 /* assertions and atoms */
1170 case '^':
1171 state->result = NewRENode(state, REOP_BOL);
1172 if (!state->result)
1173 return JS_FALSE;
1174 state->progLength++;
1175 return JS_TRUE;
1176 case '$':
1177 state->result = NewRENode(state, REOP_EOL);
1178 if (!state->result)
1179 return JS_FALSE;
1180 state->progLength++;
1181 return JS_TRUE;
1182 case '\\':
1183 if (state->cp >= state->cpend) {
1184 /* a trailing '\' is an error */
1185 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1186 return JS_FALSE;
1187 }
1188 c = *state->cp++;
1189 switch (c) {
1190 /* assertion escapes */
1191 case 'b' :
1192 state->result = NewRENode(state, REOP_WBDRY);
1193 if (!state->result)
1194 return JS_FALSE;
1195 state->progLength++;
1196 return JS_TRUE;
1197 case 'B':
1198 state->result = NewRENode(state, REOP_WNONBDRY);
1199 if (!state->result)
1200 return JS_FALSE;
1201 state->progLength++;
1202 return JS_TRUE;
1203 /* Decimal escape */
1204 case '0':
1205 /* Give a strict warning. See also the note below. */
1206 if (!ReportRegExpError(state, JSREPORT_WARNING | JSREPORT_STRICT,
1207 JSMSG_INVALID_BACKREF)) {
1208 return JS_FALSE;
1209 }
1210 doOctal:
1211 num = 0;
1212 while (state->cp < state->cpend) {
1213 c = *state->cp;
1214 if (c < '0' || '7' < c)
1215 break;
1216 state->cp++;
1217 tmp = 8 * num + (uintN)JS7_UNDEC(c);
1218 if (tmp > 0377)
1219 break;
1220 num = tmp;
1221 }
1222 c = (jschar)num;
1223 doFlat:
1224 state->result = NewRENode(state, REOP_FLAT);
1225 if (!state->result)
1226 return JS_FALSE;
1227 state->result->u.flat.chr = c;
1228 state->result->u.flat.length = 1;
1229 state->progLength += 3;
1230 break;
1231 case '1':
1232 case '2':
1233 case '3':
1234 case '4':
1235 case '5':
1236 case '6':
1237 case '7':
1238 case '8':
1239 case '9':
1240 termStart = state->cp - 1;
1241 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1242 if (state->flags & JSREG_FIND_PAREN_ERROR)
1243 return JS_FALSE;
1244 if (num == OVERFLOW_VALUE) {
1245 /* Give a strict mode warning. */
1246 if (!ReportRegExpError(state,
1247 JSREPORT_WARNING | JSREPORT_STRICT,
1248 (c >= '8')
1249 ? JSMSG_INVALID_BACKREF
1250 : JSMSG_BAD_BACKREF)) {
1251 return JS_FALSE;
1252 }
1253
1254 /*
1255 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1256 * error here. However, for compatibility with IE, we treat the
1257 * whole backref as flat if the first character in it is not a
1258 * valid octal character, and as an octal escape otherwise.
1259 */
1260 state->cp = termStart;
1261 if (c >= '8') {
1262 /* Treat this as flat. termStart - 1 is the \. */
1263 c = '\\';
1264 goto asFlat;
1265 }
1266
1267 /* Treat this as an octal escape. */
1268 goto doOctal;
1269 }
1270
1271 /*
1272 * When FindParenCount calls the regex parser recursively (to find
1273 * the number of backrefs) num can be arbitrary and the maximum
1274 * supported number of backrefs does not bound it.
1275 */
1276 JS_ASSERT_IF(!(state->flags & JSREG_FIND_PAREN_COUNT),
1277 1 <= num && num <= 0x10000);
1278 state->result = NewRENode(state, REOP_BACKREF);
1279 if (!state->result)
1280 return JS_FALSE;
1281 state->result->u.parenIndex = num - 1;
1282 state->progLength
1283 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1284 break;
1285 /* Control escape */
1286 case 'f':
1287 c = 0xC;
1288 goto doFlat;
1289 case 'n':
1290 c = 0xA;
1291 goto doFlat;
1292 case 'r':
1293 c = 0xD;
1294 goto doFlat;
1295 case 't':
1296 c = 0x9;
1297 goto doFlat;
1298 case 'v':
1299 c = 0xB;
1300 goto doFlat;
1301 /* Control letter */
1302 case 'c':
1303 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1304 c = (jschar) (*state->cp++ & 0x1F);
1305 } else {
1306 /* back off to accepting the original '\' as a literal */
1307 --state->cp;
1308 c = '\\';
1309 }
1310 goto doFlat;
1311 /* HexEscapeSequence */
1312 case 'x':
1313 nDigits = 2;
1314 goto lexHex;
1315 /* UnicodeEscapeSequence */
1316 case 'u':
1317 nDigits = 4;
1318 lexHex:
1319 n = 0;
1320 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1321 uintN digit;
1322 c = *state->cp++;
1323 if (!isASCIIHexDigit(c, &digit)) {
1324 /*
1325 * Back off to accepting the original 'u' or 'x' as a
1326 * literal.
1327 */
1328 state->cp -= i + 2;
1329 n = *state->cp++;
1330 break;
1331 }
1332 n = (n << 4) | digit;
1333 }
1334 c = (jschar) n;
1335 goto doFlat;
1336 /* Character class escapes */
1337 case 'd':
1338 state->result = NewRENode(state, REOP_DIGIT);
1339 doSimple:
1340 if (!state->result)
1341 return JS_FALSE;
1342 state->progLength++;
1343 break;
1344 case 'D':
1345 state->result = NewRENode(state, REOP_NONDIGIT);
1346 goto doSimple;
1347 case 's':
1348 state->result = NewRENode(state, REOP_SPACE);
1349 goto doSimple;
1350 case 'S':
1351 state->result = NewRENode(state, REOP_NONSPACE);
1352 goto doSimple;
1353 case 'w':
1354 state->result = NewRENode(state, REOP_ALNUM);
1355 goto doSimple;
1356 case 'W':
1357 state->result = NewRENode(state, REOP_NONALNUM);
1358 goto doSimple;
1359 /* IdentityEscape */
1360 default:
1361 state->result = NewRENode(state, REOP_FLAT);
1362 if (!state->result)
1363 return JS_FALSE;
1364 state->result->u.flat.chr = c;
1365 state->result->u.flat.length = 1;
1366 state->result->kid = (void *) (state->cp - 1);
1367 state->progLength += 3;
1368 break;
1369 }
1370 break;
1371 case '[':
1372 state->result = NewRENode(state, REOP_CLASS);
1373 if (!state->result)
1374 return JS_FALSE;
1375 termStart = state->cp;
1376 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1377 for (;;) {
1378 if (state->cp == state->cpend) {
1379 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1380 JSMSG_UNTERM_CLASS, termStart);
1381
1382 return JS_FALSE;
1383 }
1384 if (*state->cp == '\\') {
1385 state->cp++;
1386 if (state->cp != state->cpend)
1387 state->cp++;
1388 continue;
1389 }
1390 if (*state->cp == ']') {
1391 state->result->u.ucclass.kidlen = state->cp - termStart;
1392 break;
1393 }
1394 state->cp++;
1395 }
1396 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1397 if (!state->classCache[i].start) {
1398 state->classCache[i].start = termStart;
1399 state->classCache[i].length = state->result->u.ucclass.kidlen;
1400 state->classCache[i].index = state->classCount;
1401 break;
1402 }
1403 if (state->classCache[i].length ==
1404 state->result->u.ucclass.kidlen) {
1405 for (n = 0; ; n++) {
1406 if (n == state->classCache[i].length) {
1407 state->result->u.ucclass.index
1408 = state->classCache[i].index;
1409 goto claim;
1410 }
1411 if (state->classCache[i].start[n] != termStart[n])
1412 break;
1413 }
1414 }
1415 }
1416 state->result->u.ucclass.index = state->classCount++;
1417
1418 claim:
1419 /*
1420 * Call CalculateBitmapSize now as we want any errors it finds
1421 * to be reported during the parse phase, not at execution.
1422 */
1423 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1424 return JS_FALSE;
1425 /*
1426 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1427 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1428 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1429 */
1430 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1431 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1432 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1433 return JS_FALSE;
1434 }
1435 state->classBitmapsMem += n;
1436 /* CLASS, <index> */
1437 state->progLength
1438 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1439 break;
1440
1441 case '.':
1442 state->result = NewRENode(state, REOP_DOT);
1443 goto doSimple;
1444
1445 case '{':
1446 {
1447 const jschar *errp = state->cp--;
1448 intN err;
1449
1450 err = ParseMinMaxQuantifier(state, JS_TRUE);
1451 state->cp = errp;
1452
1453 if (err < 0)
1454 goto asFlat;
1455
1456 /* FALL THROUGH */
1457 }
1458 case '*':
1459 case '+':
1460 case '?':
1461 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1462 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1463 return JS_FALSE;
1464 default:
1465 asFlat:
1466 state->result = NewRENode(state, REOP_FLAT);
1467 if (!state->result)
1468 return JS_FALSE;
1469 state->result->u.flat.chr = c;
1470 state->result->u.flat.length = 1;
1471 state->result->kid = (void *) (state->cp - 1);
1472 state->progLength += 3;
1473 break;
1474 }
1475 return ParseQuantifier(state);
1476 }
1477
1478 static JSBool
1479 ParseQuantifier(CompilerState *state)
1480 {
1481 RENode *term;
1482 term = state->result;
1483 if (state->cp < state->cpend) {
1484 switch (*state->cp) {
1485 case '+':
1486 state->result = NewRENode(state, REOP_QUANT);
1487 if (!state->result)
1488 return JS_FALSE;
1489 state->result->u.range.min = 1;
1490 state->result->u.range.max = (uintN)-1;
1491 /* <PLUS>, <next> ... <ENDCHILD> */
1492 state->progLength += 4;
1493 goto quantifier;
1494 case '*':
1495 state->result = NewRENode(state, REOP_QUANT);
1496 if (!state->result)
1497 return JS_FALSE;
1498 state->result->u.range.min = 0;
1499 state->result->u.range.max = (uintN)-1;
1500 /* <STAR>, <next> ... <ENDCHILD> */
1501 state->progLength += 4;
1502 goto quantifier;
1503 case '?':
1504 state->result = NewRENode(state, REOP_QUANT);
1505 if (!state->result)
1506 return JS_FALSE;
1507 state->result->u.range.min = 0;
1508 state->result->u.range.max = 1;
1509 /* <OPT>, <next> ... <ENDCHILD> */
1510 state->progLength += 4;
1511 goto quantifier;
1512 case '{': /* balance '}' */
1513 {
1514 intN err;
1515 const jschar *errp = state->cp;
1516
1517 err = ParseMinMaxQuantifier(state, JS_FALSE);
1518 if (err == 0)
1519 goto quantifier;
1520 if (err == -1)
1521 return JS_TRUE;
1522
1523 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1524 return JS_FALSE;
1525 }
1526 default:;
1527 }
1528 }
1529 return JS_TRUE;
1530
1531 quantifier:
1532 if (state->treeDepth == TREE_DEPTH_MAX) {
1533 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1534 return JS_FALSE;
1535 }
1536
1537 ++state->treeDepth;
1538 ++state->cp;
1539 state->result->kid = term;
1540 if (state->cp < state->cpend && *state->cp == '?') {
1541 ++state->cp;
1542 state->result->u.range.greedy = JS_FALSE;
1543 } else {
1544 state->result->u.range.greedy = JS_TRUE;
1545 }
1546 return JS_TRUE;
1547 }
1548
1549 static intN
1550 ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues)
1551 {
1552 uintN min, max;
1553 jschar c;
1554 const jschar *errp = state->cp++;
1555
1556 c = *state->cp;
1557 if (JS7_ISDEC(c)) {
1558 ++state->cp;
1559 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1560 c = *state->cp;
1561
1562 if (!ignoreValues && min == OVERFLOW_VALUE)
1563 return JSMSG_MIN_TOO_BIG;
1564
1565 if (c == ',') {
1566 c = *++state->cp;
1567 if (JS7_ISDEC(c)) {
1568 ++state->cp;
1569 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1570 c = *state->cp;
1571 if (!ignoreValues && max == OVERFLOW_VALUE)
1572 return JSMSG_MAX_TOO_BIG;
1573 if (!ignoreValues && min > max)
1574 return JSMSG_OUT_OF_ORDER;
1575 } else {
1576 max = (uintN)-1;
1577 }
1578 } else {
1579 max = min;
1580 }
1581 if (c == '}') {
1582 state->result = NewRENode(state, REOP_QUANT);
1583 if (!state->result)
1584 return JSMSG_OUT_OF_MEMORY;
1585 state->result->u.range.min = min;
1586 state->result->u.range.max = max;
1587 /*
1588 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1589 * where <max> is written as compact(max+1) to make
1590 * (uintN)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1591 */
1592 state->progLength += (1 + GetCompactIndexWidth(min)
1593 + GetCompactIndexWidth(max + 1)
1594 +3);
1595 return 0;
1596 }
1597 }
1598
1599 state->cp = errp;
1600 return -1;
1601 }
1602
1603 static JSBool
1604 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
1605 {
1606 ptrdiff_t offset = target - jump;
1607
1608 /* Check that target really points forward. */
1609 JS_ASSERT(offset >= 2);
1610 if ((size_t)offset > OFFSET_MAX)
1611 return JS_FALSE;
1612
1613 jump[0] = JUMP_OFFSET_HI(offset);
1614 jump[1] = JUMP_OFFSET_LO(offset);
1615 return JS_TRUE;
1616 }
1617
1618 /* Copy the charset data from a character class node to the charset list
1619 * in the regexp object. */
1620 static JS_ALWAYS_INLINE RECharSet *
1621 InitNodeCharSet(JSRegExp *re, RENode *node)
1622 {
1623 RECharSet *charSet = &re->classList[node->u.ucclass.index];
1624 charSet->converted = JS_FALSE;
1625 charSet->length = node->u.ucclass.bmsize;
1626 charSet->u.src.startIndex = node->u.ucclass.startIndex;
1627 charSet->u.src.length = node->u.ucclass.kidlen;
1628 charSet->sense = node->u.ucclass.sense;
1629 return charSet;
1630 }
1631
1632 /*
1633 * Generate bytecode for the tree rooted at t using an explicit stack instead
1634 * of recursion.
1635 */
1636 static jsbytecode *
1637 EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
1638 jsbytecode *pc, RENode *t)
1639 {
1640 EmitStateStackEntry *emitStateSP, *emitStateStack;
1641 REOp op;
1642
1643 if (treeDepth == 0) {
1644 emitStateStack = NULL;
1645 } else {
1646 emitStateStack =
1647 (EmitStateStackEntry *)JS_malloc(state->context,
1648 sizeof(EmitStateStackEntry) *
1649 treeDepth);
1650 if (!emitStateStack)
1651 return NULL;
1652 }
1653 emitStateSP = emitStateStack;
1654 op = t->op;
1655 JS_ASSERT(op < REOP_LIMIT);
1656
1657 for (;;) {
1658 *pc++ = op;
1659 switch (op) {
1660 case REOP_EMPTY:
1661 --pc;
1662 break;
1663
1664 case REOP_ALTPREREQ2:
1665 case REOP_ALTPREREQ:
1666 JS_ASSERT(emitStateSP);
1667 emitStateSP->altHead = pc - 1;
1668 emitStateSP->endTermFixup = pc;
1669 pc += OFFSET_LEN;
1670 SET_ARG(pc, t->u.altprereq.ch1);
1671 pc += ARG_LEN;
1672 SET_ARG(pc, t->u.altprereq.ch2);
1673 pc += ARG_LEN;
1674
1675 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
1676 pc += OFFSET_LEN;
1677
1678 emitStateSP->continueNode = t;
1679 emitStateSP->continueOp = REOP_JUMP;
1680 emitStateSP->jumpToJumpFlag = JS_FALSE;
1681 ++emitStateSP;
1682 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1683 t = (RENode *) t->kid;
1684 op = t->op;
1685 JS_ASSERT(op < REOP_LIMIT);
1686 continue;
1687
1688 case REOP_JUMP:
1689 emitStateSP->nextTermFixup = pc; /* offset to following term */
1690 pc += OFFSET_LEN;
1691 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
1692 goto jump_too_big;
1693 emitStateSP->continueOp = REOP_ENDALT;
1694 ++emitStateSP;
1695 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1696 t = (RENode *) t->u.kid2;
1697 op = t->op;
1698 JS_ASSERT(op < REOP_LIMIT);
1699 continue;
1700
1701 case REOP_ENDALT:
1702 /*
1703 * If we already patched emitStateSP->nextTermFixup to jump to
1704 * a nearer jump, to avoid 16-bit immediate offset overflow, we
1705 * are done here.
1706 */
1707 if (emitStateSP->jumpToJumpFlag)
1708 break;
1709
1710 /*
1711 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
1712 * REOP_ENDALT is executed only on successful match of the last
1713 * alternate in a group.
1714 */
1715 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1716 goto jump_too_big;
1717 if (t->op != REOP_ALT) {
1718 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
1719 goto jump_too_big;
1720 }
1721
1722 /*
1723 * If the program is bigger than the REOP_JUMP offset range, then
1724 * we must check for alternates before this one that are part of
1725 * the same group, and fix up their jump offsets to target jumps
1726 * close enough to fit in a 16-bit unsigned offset immediate.
1727 */
1728 if ((size_t)(pc - re->program) > OFFSET_MAX &&
1729 emitStateSP > emitStateStack) {
1730 EmitStateStackEntry *esp, *esp2;
1731 jsbytecode *alt, *jump;
1732 ptrdiff_t span, header;
1733
1734 esp2 = emitStateSP;
1735 alt = esp2->altHead;
1736 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
1737 if (esp->continueOp == REOP_ENDALT &&
1738 !esp->jumpToJumpFlag &&
1739 esp->nextTermFixup + OFFSET_LEN == alt &&
1740 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
1741 ? esp->endTermFixup
1742 : esp->nextTermFixup)) > OFFSET_MAX) {
1743 alt = esp->altHead;
1744 jump = esp->nextTermFixup;
1745
1746 /*
1747 * The span must be 1 less than the distance from
1748 * jump offset to jump offset, so we actually jump
1749 * to a REOP_JUMP bytecode, not to its offset!
1750 */
1751 for (;;) {
1752 JS_ASSERT(jump < esp2->nextTermFixup);
1753 span = esp2->nextTermFixup - jump - 1;
1754 if ((size_t)span <= OFFSET_MAX)
1755 break;
1756 do {
1757 if (--esp2 == esp)
1758 goto jump_too_big;
1759 } while (esp2->continueOp != REOP_ENDALT);
1760 }
1761
1762 jump[0] = JUMP_OFFSET_HI(span);
1763 jump[1] = JUMP_OFFSET_LO(span);
1764
1765 if (esp->continueNode->op != REOP_ALT) {
1766 /*
1767 * We must patch the offset at esp->endTermFixup
1768 * as well, for the REOP_ALTPREREQ{,2} opcodes.
1769 * If we're unlucky and endTermFixup is more than
1770 * OFFSET_MAX bytes from its target, we cheat by
1771 * jumping 6 bytes to the jump whose offset is at
1772 * esp->nextTermFixup, which has the same target.
1773 */
1774 jump = esp->endTermFixup;
1775 header = esp->nextTermFixup - jump;
1776 span += header;
1777 if ((size_t)span > OFFSET_MAX)
1778 span = header;
1779
1780 jump[0] = JUMP_OFFSET_HI(span);
1781 jump[1] = JUMP_OFFSET_LO(span);
1782 }
1783
1784 esp->jumpToJumpFlag = JS_TRUE;
1785 }
1786 }
1787 }
1788 break;
1789
1790 case REOP_ALT:
1791 JS_ASSERT(emitStateSP);
1792 emitStateSP->altHead = pc - 1;
1793 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
1794 pc += OFFSET_LEN;
1795 emitStateSP->continueNode = t;
1796 emitStateSP->continueOp = REOP_JUMP;
1797 emitStateSP->jumpToJumpFlag = JS_FALSE;
1798 ++emitStateSP;
1799 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1800 t = (RENode *) t->kid;
1801 op = t->op;
1802 JS_ASSERT(op < REOP_LIMIT);
1803 continue;
1804
1805 case REOP_FLAT:
1806 /*
1807 * Coalesce FLATs if possible and if it would not increase bytecode
1808 * beyond preallocated limit. The latter happens only when bytecode
1809 * size for coalesced string with offset p and length 2 exceeds 6
1810 * bytes preallocated for 2 single char nodes, i.e. when
1811 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
1812 * GetCompactIndexWidth(p) > 4.
1813 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
1814 * nodes strictly decreases bytecode size, the check has to be
1815 * done only for the first coalescing.
1816 */
1817 if (t->kid &&
1818 GetCompactIndexWidth((jschar *)t->kid - state->cpbegin) <= 4)
1819 {
1820 while (t->next &&
1821 t->next->op == REOP_FLAT &&
1822 (jschar*)t->kid + t->u.flat.length ==
1823 (jschar*)t->next->kid) {
1824 t->u.flat.length += t->next->u.flat.length;
1825 t->next = t->next->next;
1826 }
1827 }
1828 if (t->kid && t->u.flat.length > 1) {
1829 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
1830 pc = WriteCompactIndex(pc, (jschar *)t->kid - state->cpbegin);
1831 pc = WriteCompactIndex(pc, t->u.flat.length);
1832 } else if (t->u.flat.chr < 256) {
1833 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
1834 *pc++ = (jsbytecode) t->u.flat.chr;
1835 } else {
1836 pc[-1] = (state->flags & JSREG_FOLD)
1837 ? REOP_UCFLAT1i
1838 : REOP_UCFLAT1;
1839 SET_ARG(pc, t->u.flat.chr);
1840 pc += ARG_LEN;
1841 }
1842 break;
1843
1844 case REOP_LPAREN:
1845 JS_ASSERT(emitStateSP);
1846 pc = WriteCompactIndex(pc, t->u.parenIndex);
1847 emitStateSP->continueNode = t;
1848 emitStateSP->continueOp = REOP_RPAREN;
1849 ++emitStateSP;
1850 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1851 t = (RENode *) t->kid;
1852 op = t->op;
1853 continue;
1854
1855 case REOP_RPAREN:
1856 pc = WriteCompactIndex(pc, t->u.parenIndex);
1857 break;
1858
1859 case REOP_BACKREF:
1860 pc = WriteCompactIndex(pc, t->u.parenIndex);
1861 break;
1862
1863 case REOP_ASSERT:
1864 JS_ASSERT(emitStateSP);
1865 emitStateSP->nextTermFixup = pc;
1866 pc += OFFSET_LEN;
1867 emitStateSP->continueNode = t;
1868 emitStateSP->continueOp = REOP_ASSERTTEST;
1869 ++emitStateSP;
1870 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1871 t = (RENode *) t->kid;
1872 op = t->op;
1873 continue;
1874
1875 case REOP_ASSERTTEST:
1876 case REOP_ASSERTNOTTEST:
1877 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1878 goto jump_too_big;
1879 break;
1880
1881 case REOP_ASSERT_NOT:
1882 JS_ASSERT(emitStateSP);
1883 emitStateSP->nextTermFixup = pc;
1884 pc += OFFSET_LEN;
1885 emitStateSP->continueNode = t;
1886 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
1887 ++emitStateSP;
1888 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1889 t = (RENode *) t->kid;
1890 op = t->op;
1891 continue;
1892
1893 case REOP_QUANT:
1894 JS_ASSERT(emitStateSP);
1895 if (t->u.range.min == 0 && t->u.range.max == (uintN)-1) {
1896 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
1897 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
1898 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
1899 } else if (t->u.range.min == 1 && t->u.range.max == (uintN) -1) {
1900 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
1901 } else {
1902 if (!t->u.range.greedy)
1903 pc[-1] = REOP_MINIMALQUANT;
1904 pc = WriteCompactIndex(pc, t->u.range.min);
1905 /*
1906 * Write max + 1 to avoid using size_t(max) + 1 bytes
1907 * for (uintN)-1 sentinel.
1908 */
1909 pc = WriteCompactIndex(pc, t->u.range.max + 1);
1910 }
1911 emitStateSP->nextTermFixup = pc;
1912 pc += OFFSET_LEN;
1913 emitStateSP->continueNode = t;
1914 emitStateSP->continueOp = REOP_ENDCHILD;
1915 ++emitStateSP;
1916 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1917 t = (RENode *) t->kid;
1918 op = t->op;
1919 continue;
1920
1921 case REOP_ENDCHILD:
1922 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1923 goto jump_too_big;
1924 break;
1925
1926 case REOP_CLASS:
1927 if (!t->u.ucclass.sense)
1928 pc[-1] = REOP_NCLASS;
1929 pc = WriteCompactIndex(pc, t->u.ucclass.index);
1930 InitNodeCharSet(re, t);
1931 break;
1932
1933 default:
1934 break;
1935 }
1936
1937 t = t->next;
1938 if (t) {
1939 op = t->op;
1940 } else {
1941 if (emitStateSP == emitStateStack)
1942 break;
1943 --emitStateSP;
1944 t = emitStateSP->continueNode;
1945 op = (REOp) emitStateSP->continueOp;
1946 }
1947 }
1948
1949 cleanup:
1950 if (emitStateStack)
1951 JS_free(state->context, emitStateStack);
1952 return pc;
1953
1954 jump_too_big:
1955 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1956 pc = NULL;
1957 goto cleanup;
1958 }
1959
1960 static JSBool
1961 CompileRegExpToAST(JSContext* cx, JSTokenStream* ts,
1962 JSString* str, uintN flags, CompilerState& state)
1963 {
1964 uintN i;
1965 size_t len;
1966
1967 len = JSSTRING_LENGTH(str);
1968
1969 state.context = cx;
1970 state.tokenStream = ts;
1971 state.cp = js_UndependString(cx, str);
1972 if (!state.cp)
1973 return JS_FALSE;
1974 state.cpbegin = state.cp;
1975 state.cpend = state.cp + len;
1976 state.flags = flags;
1977 state.parenCount = 0;
1978 state.classCount = 0;
1979 state.progLength = 0;
1980 state.treeDepth = 0;
1981 state.classBitmapsMem = 0;
1982 for (i = 0; i < CLASS_CACHE_SIZE; i++)
1983 state.classCache[i].start = NULL;
1984
1985 if (len != 0 && (flags & JSREG_FLAT)) {
1986 state.result = NewRENode(&state, REOP_FLAT);
1987 if (!state.result)
1988 return JS_FALSE;
1989 state.result->u.flat.chr = *state.cpbegin;
1990 state.result->u.flat.length = len;
1991 state.result->kid = (void *) state.cpbegin;
1992 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
1993 state.progLength += 1 + GetCompactIndexWidth(0)
1994 + GetCompactIndexWidth(len);
1995 return JS_TRUE;
1996 }
1997
1998 return ParseRegExp(&state);
1999 }
2000
2001 #ifdef JS_TRACER
2002 typedef List<LIns*, LIST_NonGCObjects> LInsList;
2003
2004 /* Dummy GC for nanojit placement new. */
2005 static GC gc;
2006
2007 static void*
2008 HashRegExp(uint16 flags, jschar* s, size_t n)
2009 {
2010 uint32 h;
2011
2012 for (h = 0; n; s++, n--)
2013 h = JS_ROTATE_LEFT32(h, 4) ^ *s;
2014 return (void*)(h + flags);
2015 }
2016
2017 struct RESideExit : public SideExit {
2018 size_t re_length;
2019 uint16 re_flags;
2020 jschar re_chars[1];
2021 };
2022
2023 /* Return the cached fragment for the given regexp, or NULL. */
2024 static Fragment*
2025 LookupNativeRegExp(JSContext* cx, void* hash, uint16 re_flags,
2026 jschar* re_chars, size_t re_length)
2027 {
2028 Fragmento* fragmento = JS_TRACE_MONITOR(cx).reFragmento;
2029 Fragment* fragment = fragmento->getLoop(hash);
2030 while (fragment) {
2031 if (fragment->lastIns) {
2032 RESideExit* exit = (RESideExit*)fragment->lastIns->record()->exit;
2033 if (exit->re_flags == re_flags &&
2034 exit->re_length == re_length &&
2035 !memcmp(exit->re_chars, re_chars, re_length * sizeof(jschar))) {
2036 return fragment;
2037 }
2038 }
2039 fragment = fragment->peer;
2040 }
2041 return NULL;
2042 }
2043
2044 static JSBool
2045 ProcessCharSet(JSContext *cx, JSRegExp *re, RECharSet *charSet);
2046
2047 class RegExpNativeCompiler {
2048 private:
2049 JSContext* cx;
2050 JSRegExp* re;
2051 CompilerState* cs; /* RegExp to compile */
2052 Fragment* fragment;
2053 LirWriter* lir;
2054 LirBufWriter* lirBufWriter; /* for skip */
2055
2056 LIns* state;
2057 LIns* gdata;
2058 LIns* cpend;
2059
2060 JSBool isCaseInsensitive() const { return (cs->flags & JSREG_FOLD) != 0; }
2061
2062 JSBool targetCurrentPoint(LIns* ins)
2063 {
2064 if (fragment->lirbuf->outOMem())
2065 return JS_FALSE;
2066 ins->target(lir->ins0(LIR_label));
2067 return JS_TRUE;
2068 }
2069
2070 JSBool targetCurrentPoint(LInsList& fails)
2071 {
2072 if (fragment->lirbuf->outOMem())
2073 return JS_FALSE;
2074 LIns* fail = lir->ins0(LIR_label);
2075 for (size_t i = 0; i < fails.size(); ++i) {
2076 fails[i]->target(fail);
2077 }
2078 fails.clear();
2079 return JS_TRUE;
2080 }
2081
2082 /*
2083 * These functions return the new position after their match operation,
2084 * or NULL if there was an error.
2085 */
2086 LIns* compileEmpty(RENode* node, LIns* pos, LInsList& fails)
2087 {
2088 return pos;
2089 }
2090
2091 LIns* compileFlatSingleChar(jschar ch, LIns* pos, LInsList& fails)
2092 {
2093 /*
2094 * Fast case-insensitive test for ASCII letters: convert text
2095 * char to lower case by bit-or-ing in 32 and compare.
2096 */
2097 JSBool useFastCI = JS_FALSE;
2098 jschar ch2 = ch; /* 2nd char to test for if ci */
2099 if (cs->flags & JSREG_FOLD) {
2100 if ((L'A' <= ch && ch <= L'Z') || (L'a' <= ch && ch <= L'z')) {
2101 ch |= 32;
2102 ch2 = ch;
2103 useFastCI = JS_TRUE;
2104 } else if (JS_TOLOWER(ch) != ch) {
2105 ch2 = JS_TOLOWER(ch);
2106 ch = JS_TOUPPER(ch);
2107 }
2108 }
2109
2110 LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_lt, pos, cpend), 0);
2111 fails.add(to_fail);
2112 LIns* text_ch = lir->insLoad(LIR_ldcs, pos, lir->insImm(0));
2113 LIns* comp_ch = useFastCI ?
2114 lir->ins2(LIR_or, text_ch, lir->insImm(32)) :
2115 text_ch;
2116 if (ch == ch2) {
2117 fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch)), 0));
2118 } else {
2119 LIns* to_ok = lir->insBranch(LIR_jt, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch)), 0);
2120 fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch2)), 0));
2121 if (!targetCurrentPoint(to_ok))
2122 return NULL;
2123 }
2124
2125 return lir->ins2(LIR_piadd, pos, lir->insImm(2));
2126 }
2127
2128 LIns* compileFlatDoubleChar(jschar ch1, jschar ch2, LIns* pos,
2129 LInsList& fails)
2130 {
2131 #ifdef IS_BIG_ENDIAN
2132 uint32 word = (ch1 << 16) | ch2;
2133 #else
2134 uint32 word = (ch2 << 16) | ch1;
2135 #endif
2136 /*
2137 * Fast case-insensitive test for ASCII letters: convert text
2138 * char to lower case by bit-or-ing in 32 and compare.
2139 */
2140 JSBool useFastCI = JS_FALSE;
2141 union { jschar c[2]; uint32 i; } mask;
2142 if (cs->flags & JSREG_FOLD) {
2143 JSBool mask1 = (L'A' <= ch1 && ch1 <= L'Z') || (L'a' <= ch1 && ch1 <= L'z');
2144 JSBool mask2 = (L'A' <= ch2 && ch2 <= L'Z') || (L'a' <= ch2 && ch2 <= L'z');
2145 if ((!mask1 && JS_TOLOWER(ch1) != ch1) || (!mask2 && JS_TOLOWER(ch2) != ch2)) {
2146 pos = compileFlatSingleChar(ch1, pos, fails);
2147 if (!pos) return NULL;
2148 return compileFlatSingleChar(ch2, pos, fails);
2149 }
2150
2151 mask.c[0] = mask1 ? 0x0020 : 0x0;
2152 mask.c[1] = mask2 ? 0x0020 : 0x0;
2153
2154 if (mask.i) {
2155 word |= mask.i;
2156 useFastCI = JS_TRUE;
2157 }
2158 }
2159
2160 LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_lt, pos, lir->ins2(LIR_sub, cpend, lir->insImm(2))), 0);
2161 fails.add(to_fail);
2162 LIns* text_word = lir->insLoad(LIR_ld, pos, lir->insImm(0));
2163 LIns* comp_word = useFastCI ?
2164 lir->ins2(LIR_or, text_word, lir->insImm(mask.i)) :
2165 text_word;
2166 fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, comp_word, lir->insImm(word)), 0));
2167
2168 return lir->ins2(LIR_piadd, pos, lir->insImm(4));
2169 }
2170
2171 LIns* compileClass(RENode* node, LIns* pos, LInsList& fails)
2172 {
2173 if (!node->u.ucclass.sense)
2174 return JS_FALSE;
2175 /*
2176 * If we share generated native code, we need to make a copy
2177 * of the bitmap because the original regexp's copy is destroyed
2178 * when that regexp is.
2179 */
2180 RECharSet *charSet = &re->classList[node->u.ucclass.index];
2181 size_t bitmapLen = (charSet->length >> 3) + 1;
2182 /* skip() can't hold large data blocks. */
2183 if (bitmapLen > 1024)
2184 return NULL;
2185 /* The following line allocates charSet.u.bits if successful. */
2186 if (!charSet->converted && !ProcessCharSet(cx, re, charSet))
2187 return NULL;
2188 LIns* skip = lirBufWriter->skip(bitmapLen);
2189 if (fragment->lirbuf->outOMem())
2190 return NULL;
2191 void* bitmapData = skip->payload();
2192 memcpy(bitmapData, charSet->u.bits, bitmapLen);
2193
2194 LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_lt, pos, cpend), 0);
2195 fails.add(to_fail);
2196 LIns* text_ch = lir->insLoad(LIR_ldcs, pos, lir->insImm(0));
2197 fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_le, text_ch, lir->insImm(charSet->length)), 0));
2198 LIns* byteIndex = lir->ins2(LIR_rsh, text_ch, lir->insImm(3));
2199 LIns* bitmap = lir->insImmPtr(bitmapData);
2200 LIns* byte = lir->insLoad(LIR_ldcb, lir->ins2(LIR_piadd, bitmap, byteIndex), (int) 0);
2201 LIns* bitMask = lir->ins2(LIR_lsh, lir->insImm(1),
2202 lir->ins2(LIR_and, text_ch, lir->insImm(0x7)));
2203 LIns* test = lir->ins2(LIR_eq, lir->ins2(LIR_and, byte, bitMask), lir->insImm(0));
2204
2205 LIns* to_next = lir->insBranch(LIR_jt, test, 0);
2206 fails.add(to_next);
2207 return lir->ins2(LIR_piadd, pos, lir->insImm(2));
2208 }
2209
2210 LIns* compileAlt(RENode* node, LIns* pos, LInsList& fails)
2211 {
2212 LInsList kidFails(NULL);
2213 if (!compileNode((RENode *) node->kid, pos, kidFails))
2214 return NULL;
2215 if (!compileNode(node->next, pos, kidFails))
2216 return NULL;
2217
2218 if (!targetCurrentPoint(kidFails))
2219 return NULL;
2220 if (!compileNode(node->u.altprereq.kid2, pos, fails))
2221 return NULL;
2222 /*
2223 * Disable compilation for any regexp where something follows an
2224 * alternation. To make this work, we need to redesign to either
2225 * (a) pass around continuations so we know the right place to go
2226 * when we logically return, or (b) generate explicit backtracking
2227 * code.
2228 */
2229 if (node->next)
2230 return NULL;
2231 return pos;
2232 }
2233
2234 #if defined(AVMPLUS_ARM) || defined(AVMPLUS_SPARC)
2235 /* We can't do this on ARM or SPARC, since it relies on doing a 32-bit load from
2236 * a pointer which is only 2-byte aligned.
2237 */
2238 #undef USE_DOUBLE_CHAR_MATCH
2239 #else
2240 #define USE_DOUBLE_CHAR_MATCH
2241 #endif
2242
2243 JSBool compileNode(RENode* node, LIns* pos, LInsList& fails)
2244 {
2245 for (; node; node = node->next) {
2246 if (fragment->lirbuf->outOMem())
2247 return JS_FALSE;
2248
2249 switch (node->op) {
2250 case REOP_EMPTY:
2251 pos = compileEmpty(node, pos, fails);
2252 break;
2253 case REOP_FLAT:
2254 #ifdef USE_DOUBLE_CHAR_MATCH
2255 if (node->u.flat.length == 1) {
2256 if (node->next && node->next->op == REOP_FLAT &&
2257 node->next->u.flat.length == 1) {
2258 pos = compileFlatDoubleChar(node->u.flat.chr,
2259 node->next->u.flat.chr,
2260 pos, fails);
2261 node = node->next;
2262 } else {
2263 pos = compileFlatSingleChar(node->u.flat.chr, pos, fails);
2264 }
2265 } else {
2266 size_t i;
2267 for (i = 0; i < node->u.flat.length - 1; i += 2) {
2268 if (fragment->lirbuf->outOMem())
2269 return JS_FALSE;
2270 pos = compileFlatDoubleChar(((jschar*) node->kid)[i],
2271 ((jschar*) node->kid)[i+1],
2272 pos, fails);
2273 if (!pos) break;
2274 }
2275 if (pos && i == node->u.flat.length - 1)
2276 pos = compileFlatSingleChar(((jschar*) node->kid)[i], pos, fails);
2277 }
2278 #else
2279 if (node->u.flat.length == 1) {
2280 pos = compileFlatSingleChar(node->u.flat.chr, pos, fails);
2281 } else {
2282 for (size_t i = 0; i < node->u.flat.length; i++) {
2283 if (fragment->lirbuf->outOMem())
2284 return JS_FALSE;
2285 pos = compileFlatSingleChar(((jschar*) node->kid)[i], pos, fails);
2286 if (!pos) break;
2287 }
2288 }
2289 #endif
2290 break;
2291 case REOP_ALT:
2292 case REOP_ALTPREREQ:
2293 pos = compileAlt(node, pos, fails);
2294 break;
2295 case REOP_CLASS:
2296 pos = compileClass(node, pos, fails);
2297 break;
2298 default:
2299 return JS_FALSE;
2300 }
2301 if (!pos)
2302 return JS_FALSE;
2303 }
2304
2305 lir->insStorei(pos, state, (int) offsetof(REMatchState, cp));
2306 lir->ins1(LIR_ret, state);
2307 return JS_TRUE;
2308 }
2309
2310 JSBool compileSticky(RENode* root, LIns* start)
2311 {
2312 LInsList fails(NULL);
2313 if (!compileNode(root, start, fails))
2314 return JS_FALSE;
2315 if (!targetCurrentPoint(fails))
2316 return JS_FALSE;
2317 lir->ins1(LIR_ret, lir->insImm(0));
2318 return JS_TRUE;
2319 }
2320
2321 JSBool compileAnchoring(RENode* root, LIns* start)
2322 {
2323 /* Even at the end, the empty regexp would match. */
2324 LIns* to_next = lir->insBranch(LIR_jf,
2325 lir->ins2(LIR_le, start, cpend), 0);
2326 LInsList fails(NULL);
2327 if (!compileNode(root, start, fails))
2328 return JS_FALSE;
2329
2330 if (!targetCurrentPoint(to_next))
2331 return JS_FALSE;
2332 lir->ins1(LIR_ret, lir->insImm(0));
2333
2334 if (!targetCurrentPoint(fails))
2335 return JS_FALSE;
2336 lir->insStorei(lir->ins2(LIR_piadd, start, lir->insImm(2)), gdata,
2337 (int) offsetof(REGlobalData, skipped));
2338
2339 return JS_TRUE;
2340 }
2341
2342 inline LIns*
2343 addName(LirBuffer* lirbuf, LIns* ins, const char* name)
2344 {
2345 #ifdef NJ_VERBOSE
2346 debug_only_v(lirbuf->names->addName(ins, name);)
2347 #endif
2348 return ins;
2349 }
2350
2351 /*
2352 * Insert the side exit and guard record for a compiled regexp. Most
2353 * of the fields are not used. The important part is the regexp source
2354 * and flags, which we use as the fragment lookup key.
2355 */
2356 GuardRecord* insertGuard(jschar* re_chars, size_t re_length)
2357 {
2358 LIns* skip = lirBufWriter->skip(sizeof(GuardRecord) +
2359 sizeof(RESideExit) +
2360 (re_length-1) * sizeof(jschar));
2361 GuardRecord* guard = (GuardRecord *) skip->payload();
2362 memset(guard, 0, sizeof(*guard));
2363 RESideExit* exit = (RESideExit*)(guard+1);
2364 guard->exit = exit;
2365 guard->exit->target = fragment;
2366 exit->re_flags = re->flags;
2367 exit->re_length = re_length;
2368 memcpy(exit->re_chars, re_chars, re_length * sizeof(jschar));
2369 fragment->lastIns = lir->insGuard(LIR_loop, lir->insImm(1), skip);
2370 return guard;
2371 }
2372
2373 public:
2374 RegExpNativeCompiler(JSRegExp* re, CompilerState* cs, Fragment* fragment)
2375 : re(re), cs(cs), fragment(fragment), lir(NULL), lirBufWriter(NULL) { }
2376
2377 JSBool compile(JSContext* cx)
2378 {
2379 GuardRecord* guard = NULL;
2380 LIns* start;
2381 bool oom = false;
2382 jschar* re_chars;
2383 size_t re_length;
2384 Fragmento* fragmento = JS_TRACE_MONITOR(cx).reFragmento;
2385
2386 JSSTRING_CHARS_AND_LENGTH(re->source, re_chars, re_length);
2387 /*
2388 * If the regexp is too long nanojit will assert when we
2389 * try to insert the guard record.
2390 */
2391 if (re_length > 1024)
2392 return JS_FALSE;
2393
2394 this->cx = cx;
2395 /* At this point we have an empty fragment. */
2396 LirBuffer* lirbuf = fragment->lirbuf;
2397 if (lirbuf->outOMem())
2398 goto fail;
2399 /* FIXME Use bug 463260 smart pointer when available. */
2400 lir = lirBufWriter = new (&gc) LirBufWriter(lirbuf);
2401
2402 /* FIXME Use bug 463260 smart pointer when available. */
2403 #ifdef NJ_VERBOSE
2404 debug_only_v(fragment->lirbuf->names = new (&gc) LirNameMap(&gc, NULL, fragmento->labels);)
2405 #endif
2406 /* FIXME Use bug 463260 smart pointer when available. */
2407 #ifdef NJ_VERBOSE
2408 debug_only_v(lir = new (&gc) VerboseWriter(&gc, lir, lirbuf->names);)
2409 #endif
2410
2411 lir->ins0(LIR_start);
2412 lirbuf->state = state = addName(lirbuf, lir->insParam(0, 0), "state");
2413 lirbuf->param1 = gdata = addName(lirbuf, lir->insParam(1, 0), "gdata");
2414 start = addName(lirbuf, lir->insLoad(LIR_ldp, lirbuf->param1, (int) offsetof(REGlobalData, skipped)), "start");
2415 cpend = addName(lirbuf, lir->insLoad(LIR_ldp, lirbuf->param1, offsetof(REGlobalData, cpend)), "cpend");
2416
2417 if (cs->flags & JSREG_STICKY) {
2418 if (!compileSticky(cs->result, start))
2419 goto fail;
2420 } else {
2421 if (!compileAnchoring(cs->result, start))
2422 goto fail;
2423 }
2424
2425 guard = insertGuard(re_chars, re_length);
2426
2427 if (lirbuf->outOMem())
2428 goto fail;
2429 ::compile(fragmento->assm(), fragment);
2430 if (fragmento->assm()->error() != nanojit::None) {
2431 oom = fragmento->assm()->error() == nanojit::OutOMem;
2432 goto fail;
2433 }
2434
2435 delete lirBufWriter;
2436 #ifdef NJ_VERBOSE
2437 debug_only_v(delete lir;)
2438 #endif
2439 return JS_TRUE;
2440 fail:
2441 if (lirbuf->outOMem() || oom ||
2442 js_OverfullFragmento(&JS_TRACE_MONITOR(cx), fragmento)) {
2443 fragmento->clearFrags();
2444 lirbuf->rewind();
2445 } else {
2446 if (!guard) insertGuard(re_chars, re_length);
2447 fragment->blacklist();
2448 }
2449 delete lirBufWriter;
2450 #ifdef NJ_VERBOSE
2451 debug_only_v(delete lir;)
2452 #endif
2453 return JS_FALSE;
2454 }
2455 };
2456
2457 /*
2458 * Compile a regexp to native code in the given fragment.
2459 */
2460 static inline JSBool
2461 CompileRegExpToNative(JSContext* cx, JSRegExp* re, Fragment* fragment)
2462 {
2463 JSBool rv = JS_FALSE;
2464 void* mark;
2465 CompilerState state;
2466 RegExpNativeCompiler rc(re, &state, fragment);
2467
2468 JS_ASSERT(!fragment->code());
2469 JS_ASSERT(!fragment->isBlacklisted());
2470
2471 mark = JS_ARENA_MARK(&cx->tempPool);
2472 if (!CompileRegExpToAST(cx, NULL, re->source, re->flags, state)) {
2473 goto out;
2474 }
2475 rv = rc.compile(cx);
2476 out:
2477 JS_ARENA_RELEASE(&cx->tempPool, mark);
2478 return rv;
2479 }
2480
2481 /* Function type for a compiled native regexp. */
2482 typedef REMatchState* (FASTCALL *NativeRegExp)(REMatchState*, REGlobalData*);
2483
2484 /*
2485 * Return a compiled native regexp if one already exists or can be created
2486 * now, or NULL otherwise.
2487 */
2488 static NativeRegExp
2489 GetNativeRegExp(JSContext* cx, JSRegExp* re)
2490 {
2491 Fragment *fragment;
2492 jschar* re_chars;
2493 size_t re_length;
2494 Fragmento* fragmento = JS_TRACE_MONITOR(cx).reFragmento;
2495
2496 JSSTRING_CHARS_AND_LENGTH(re->source, re_chars, re_length);
2497 void* hash = HashRegExp(re->flags, re_chars, re_length);
2498 fragment = LookupNativeRegExp(cx, hash, re->flags, re_chars, re_length);
2499 if (fragment) {
2500 if (fragment->code())
2501 goto ok;
2502 if (fragment->isBlacklisted())
2503 return NULL;
2504 } else {
2505 fragment = fragmento->getAnchor(hash);
2506 fragment->lirbuf = JS_TRACE_MONITOR(cx).reLirBuf;
2507 fragment->root = fragment;
2508 }
2509
2510 if (!CompileRegExpToNative(cx, re, fragment))
2511 return NULL;
2512 ok:
2513 union { NIns *code; NativeRegExp func; } u;
2514 u.code = fragment->code();
2515 return u.func;
2516 }
2517 #endif
2518
2519 JSRegExp *
2520 js_NewRegExp(JSContext *cx, JSTokenStream *ts,
2521 JSString *str, uintN flags, JSBool flat)
2522 {
2523 JSRegExp *re;
2524 void *mark;
2525 CompilerState state;
2526 size_t resize;
2527 jsbytecode *endPC;
2528 uintN i;
2529
2530 re = NULL;
2531 mark = JS_ARENA_MARK(&cx->tempPool);
2532
2533 /*
2534 * Parsing the string as flat is now expressed internally using
2535 * a flag, so that we keep this information in the JSRegExp, but
2536 * we keep the 'flat' parameter for now for compatibility.
2537 */
2538 if (flat) flags |= JSREG_FLAT;
2539 if (!CompileRegExpToAST(cx, ts, str, flags, state))
2540 goto out;
2541
2542 resize = offsetof(JSRegExp, program) + state.progLength + 1;
2543 re = (JSRegExp *) JS_malloc(cx, resize);
2544 if (!re)
2545 goto out;
2546
2547 re->nrefs = 1;
2548 JS_ASSERT(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
2549 re->classCount = state.classCount;
2550 if (re->classCount) {
2551 re->classList = (RECharSet *)
2552 JS_malloc(cx, re->classCount * sizeof(RECharSet));
2553 if (!re->classList) {
2554 js_DestroyRegExp(cx, re);
2555 re = NULL;
2556 goto out;
2557 }
2558 for (i = 0; i < re->classCount; i++)
2559 re->classList[i].converted = JS_FALSE;
2560 } else {
2561 re->classList = NULL;
2562 }
2563
2564 /* Compile the bytecode version. */
2565 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
2566 if (!endPC) {
2567 js_DestroyRegExp(cx, re);
2568 re = NULL;
2569 goto out;
2570 }
2571 *endPC++ = REOP_END;
2572 /*
2573 * Check whether size was overestimated and shrink using realloc.
2574 * This is safe since no pointers to newly parsed regexp or its parts
2575 * besides re exist here.
2576 */
2577 if ((size_t)(endPC - re->program) != state.progLength + 1) {
2578 JSRegExp *tmp;
2579 JS_ASSERT((size_t)(endPC - re->program) < state.progLength + 1);
2580 resize = offsetof(JSRegExp, program) + (endPC - re->program);
2581 tmp = (JSRegExp *) JS_realloc(cx, re, resize);
2582 if (tmp)
2583 re = tmp;
2584 }
2585
2586 re->flags = flags;
2587 re->parenCount = state.parenCount;
2588 re->source = str;
2589
2590 out:
2591 JS_ARENA_RELEASE(&cx->tempPool, mark);
2592 return re;
2593 }
2594
2595 JSRegExp *
2596 js_NewRegExpOpt(JSContext *cx, JSString *str, JSString *opt, JSBool flat)
2597 {
2598 uintN flags;
2599 jschar *s;
2600 size_t i, n;
2601 char charBuf[2];
2602
2603 flags = 0;
2604 if (opt) {
2605 JSSTRING_CHARS_AND_LENGTH(opt, s, n);
2606 for (i = 0; i < n; i++) {
2607 #define HANDLE_FLAG(name) \
2608 JS_BEGIN_MACRO \
2609 if (flags & (name)) \
2610 goto bad_flag; \
2611 flags |= (name); \
2612 JS_END_MACRO
2613 switch (s[i]) {
2614 case 'g':
2615 HANDLE_FLAG(JSREG_GLOB);
2616 break;
2617 case 'i':
2618 HANDLE_FLAG(JSREG_FOLD);
2619 break;
2620 case 'm':
2621 HANDLE_FLAG(JSREG_MULTILINE);
2622 break;
2623 case 'y':
2624 HANDLE_FLAG(JSREG_STICKY);
2625 break;
2626 default:
2627 bad_flag:
2628 charBuf[0] = (char)s[i];
2629 charBuf[1] = '\0';
2630 JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR,
2631 js_GetErrorMessage, NULL,
2632 JSMSG_BAD_REGEXP_FLAG, charBuf);
2633 return NULL;
2634 }
2635 #undef HANDLE_FLAG
2636 }
2637 }
2638 return js_NewRegExp(cx, NULL, str, flags, flat);
2639 }
2640
2641 /*
2642 * Save the current state of the match - the position in the input
2643 * text as well as the position in the bytecode. The state of any
2644 * parent expressions is also saved (preceding state).
2645 * Contents of parenCount parentheses from parenIndex are also saved.
2646 */
2647 static REBackTrackData *
2648 PushBackTrackState(REGlobalData *gData, REOp op,
2649 jsbytecode *target, REMatchState *x, const jschar *cp,
2650 size_t parenIndex, size_t parenCount)
2651 {
2652 size_t i;
2653 REBackTrackData *result =
2654 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
2655
2656 size_t sz = sizeof(REBackTrackData) +
2657 gData->stateStackTop * sizeof(REProgState) +
2658 parenCount * sizeof(RECapture);
2659
2660 ptrdiff_t btsize = gData->backTrackStackSize;
2661 ptrdiff_t btincr = ((char *)result + sz) -
2662 ((char *)gData->backTrackStack + btsize);
2663
2664 re_debug("\tBT_Push: %lu,%lu",
2665 (unsigned long) parenIndex, (unsigned long) parenCount);
2666
2667 if (btincr > 0) {
2668 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
2669
2670 btincr = JS_ROUNDUP(btincr, btsize);
2671 JS_ARENA_GROW_CAST(gData->backTrackStack, REBackTrackData *,
2672 &gData->cx->regexpPool, btsize, btincr);
2673 if (!gData->backTrackStack) {
2674 js_ReportOutOfScriptQuota(gData->cx);
2675 gData->ok = JS_FALSE;
2676 return NULL;
2677 }
2678 gData->backTrackStackSize = btsize + btincr;
2679 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
2680 }
2681 gData->backTrackSP = result;
2682 result->sz = gData->cursz;
2683 gData->cursz = sz;
2684
2685 result->backtrack_op = op;
2686 result->backtrack_pc = target;
2687 result->cp = cp;
2688 result->parenCount = parenCount;
2689 result->parenIndex = parenIndex;
2690
2691 result->saveStateStackTop = gData->stateStackTop;
2692 JS_ASSERT(gData->stateStackTop);
2693 memcpy(result + 1, gData->stateStack,
2694 sizeof(REProgState) * result->saveStateStackTop);
2695
2696 if (parenCount != 0) {
2697 memcpy((char *)(result + 1) +
2698 sizeof(REProgState) * result->saveStateStackTop,
2699 &x->parens[parenIndex],
2700 sizeof(RECapture) * parenCount);
2701 for (i = 0; i != parenCount; i++)
2702 x->parens[parenIndex + i].index = -1;
2703 }
2704
2705 return result;
2706 }
2707
2708
2709 /*
2710 * Consecutive literal characters.
2711 */
2712 #if 0
2713 static REMatchState *
2714 FlatNMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars,
2715 size_t length)
2716 {
2717 size_t i;
2718 if (length > gData->cpend - x->cp)
2719 return NULL;
2720 for (i = 0; i != length; i++) {
2721 if (matchChars[i] != x->cp[i])
2722 return NULL;
2723 }
2724 x->cp += length;
2725 return x;
2726 }
2727 #endif
2728
2729 static JS_ALWAYS_INLINE REMatchState *
2730 FlatNIMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars,
2731 size_t length)
2732 {
2733 size_t i;
2734 JS_ASSERT(gData->cpend >= x->cp);
2735 if (length > (size_t)(gData->cpend - x->cp))
2736 return NULL;
2737 for (i = 0; i != length; i++) {
2738 if (upcase(matchChars[i]) != upcase(x->cp[i]))
2739 return NULL;
2740 }
2741 x->cp += length;
2742 return x;
2743 }
2744
2745 /*
2746 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2747 * 2. If E is not a character then go to step 6.
2748 * 3. Let ch be E's character.
2749 * 4. Let A be a one-element RECharSet containing the character ch.
2750 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2751 * 6. E must be an integer. Let n be that integer.
2752 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2753 * 8. Return an internal Matcher closure that takes two arguments, a State x
2754 * and a Continuation c, and performs the following:
2755 * 1. Let cap be x's captures internal array.
2756 * 2. Let s be cap[n].
2757 * 3. If s is undefined, then call c(x) and return its result.
2758 * 4. Let e be x's endIndex.
2759 * 5. Let len be s's length.
2760 * 6. Let f be e+len.
2761 * 7. If f>InputLength, return failure.
2762 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2763 * such that Canonicalize(s[i]) is not the same character as
2764 * Canonicalize(Input [e+i]), then return failure.
2765 * 9. Let y be the State (f, cap).
2766 * 10. Call c(y) and return its result.
2767 */
2768 static REMatchState *
2769 BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex)
2770 {
2771 size_t len, i;
2772 const jschar *parenContent;
2773 RECapture *cap = &x->parens[parenIndex];
2774
2775 if (cap->index == -1)
2776 return x;
2777
2778 len = cap->length;
2779 if (x->cp + len > gData->cpend)
2780 return NULL;
2781
2782 parenContent = &gData->cpbegin[cap->index];
2783 if (gData->regexp->flags & JSREG_FOLD) {
2784 for (i = 0; i < len; i++) {
2785 if (upcase(parenContent[i]) != upcase(x->cp[i]))
2786 return NULL;
2787 }
2788 } else {
2789 for (i = 0; i < len; i++) {
2790 if (parenContent[i] != x->cp[i])
2791 return NULL;
2792 }
2793 }
2794 x->cp += len;
2795 return x;
2796 }
2797
2798
2799 /* Add a single character to the RECharSet */
2800 static void
2801 AddCharacterToCharSet(RECharSet *cs, jschar c)
2802 {
2803 uintN byteIndex = (uintN)(c >> 3);
2804 JS_ASSERT(c <= cs->length);
2805 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2806 }
2807
2808
2809 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2810 static void
2811 AddCharacterRangeToCharSet(RECharSet *cs, uintN c1, uintN c2)
2812 {
2813 uintN i;
2814
2815 uintN byteIndex1 = c1 >> 3;
2816 uintN byteIndex2 = c2 >> 3;
2817
2818 JS_ASSERT(c2 <= cs->length && c1 <= c2);
2819
2820 c1 &= 0x7;
2821 c2 &= 0x7;
2822
2823 if (byteIndex1 == byteIndex2) {
2824 cs->u.bits[byteIndex1] |= ((uint8)0xFF >> (7 - (c2 - c1))) << c1;
2825 } else {
2826 cs->u.bits[byteIndex1] |= 0xFF << c1;
2827 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2828 cs->u.bits[i] = 0xFF;
2829 cs->u.bits[byteIndex2] |= (uint8)0xFF >> (7 - c2);
2830 }
2831 }
2832
2833 struct CharacterRange {
2834 jschar start;
2835 jschar end;
2836 };
2837
2838 /*
2839 * The following characters are taken from the ECMA-262 standard, section 7.2
2840 * and 7.3, and the Unicode 3 standard, Table 6-1.
2841 */
2842 static const CharacterRange WhiteSpaceRanges[] = {
2843 /* TAB, LF, VT, FF, CR */
2844 { 0x0009, 0x000D },
2845 /* SPACE */
2846 { 0x0020, 0x0020 },
2847 /* NO-BREAK SPACE */
2848 { 0x00A0, 0x00A0 },
2849 /*
2850 * EN QUAD, EM QUAD, EN SPACE, EM SPACE, THREE-PER-EM SPACE, FOUR-PER-EM
2851 * SPACE, SIX-PER-EM SPACE, FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE,
2852 * HAIR SPACE, ZERO WIDTH SPACE
2853 */
2854 { 0x2000, 0x200B },
2855 /* LS, PS */
2856 { 0x2028, 0x2029 },
2857 /* NARROW NO-BREAK SPACE */
2858 { 0x202F, 0x202F },
2859 /* IDEOGRAPHIC SPACE */
2860 { 0x3000, 0x3000 }
2861 };
2862
2863 /* ECMA-262 standard, section 15.10.2.6. */
2864 static const CharacterRange WordRanges[] = {
2865 { jschar('0'), jschar('9') },
2866 { jschar('A'), jschar('Z') },
2867 { jschar('_'), jschar('_') },
2868 { jschar('a'), jschar('z') }
2869 };
2870
2871 static void
2872 AddCharacterRanges(RECharSet *charSet,
2873 const CharacterRange *range,
2874 const CharacterRange *end)
2875 {
2876 for (; range < end; ++range)
2877 AddCharacterRangeToCharSet(charSet, range->start, range->end);
2878 }
2879
2880 static void
2881 AddInvertedCharacterRanges(RECharSet *charSet,
2882 const CharacterRange *range,
2883 const CharacterRange *end)
2884 {
2885 uint16 previous = 0;
2886 for (; range < end; ++range) {
2887 AddCharacterRangeToCharSet(charSet, previous, range->start - 1);
2888 previous = range->end + 1;
2889 }
2890 AddCharacterRangeToCharSet(charSet, previous, charSet->length);
2891 }
2892
2893 /* Compile the source of the class into a RECharSet */
2894 static JSBool
2895 ProcessCharSet(JSContext *cx, JSRegExp *re, RECharSet *charSet)
2896 {
2897 const jschar *src, *end;
2898 JSBool inRange = JS_FALSE;
2899 jschar rangeStart = 0;
2900 uintN byteLength, n;
2901 jschar c, thisCh;
2902 intN nDigits, i;
2903
2904 JS_ASSERT(!charSet->converted);
2905 /*
2906 * Assert that startIndex and length points to chars inside [] inside
2907 * source string.
2908 */
2909 JS_ASSERT(1 <= charSet->u.src.startIndex);
2910 JS_ASSERT(charSet->u.src.startIndex
2911 < JSSTRING_LENGTH(re->source));
2912 JS_ASSERT(charSet->u.src.length <= JSSTRING_LENGTH(re->source)
2913 - 1 - charSet->u.src.startIndex);
2914
2915 charSet->converted = JS_TRUE;
2916 src = JSSTRING_CHARS(re->source) + charSet->u.src.startIndex;
2917 end = src + charSet->u.src.length;
2918 JS_ASSERT(src[-1] == '[');
2919 JS_ASSERT(end[0] == ']');
2920
2921 byteLength = (charSet->length >> 3) + 1;
2922 charSet->u.bits = (uint8 *)JS_malloc(cx, byteLength);
2923 if (!charSet->u.bits) {
2924 JS_ReportOutOfMemory(cx);
2925 return JS_FALSE;
2926 }
2927 memset(charSet->u.bits, 0, byteLength);
2928
2929 if (src == end)
2930 return JS_TRUE;
2931
2932 if (*src == '^') {
2933 JS_ASSERT(charSet->sense == JS_FALSE);
2934 ++src;
2935 } else {
2936 JS_ASSERT(charSet->sense == JS_TRUE);
2937 }
2938
2939 while (src != end) {
2940 switch (*src) {
2941 case '\\':
2942 ++src;
2943 c = *src++;
2944 switch (c) {
2945 case 'b':
2946 thisCh = 0x8;
2947 break;
2948 case 'f':
2949 thisCh = 0xC;
2950 break;
2951 case 'n':
2952 thisCh = 0xA;
2953 break;
2954 case 'r':
2955 thisCh = 0xD;
2956 break;
2957 case 't':
2958 thisCh = 0x9;
2959 break;
2960 case 'v':
2961 thisCh = 0xB;
2962 break;
2963 case 'c':
2964 if (src < end && JS_ISWORD(*src)) {
2965 thisCh = (jschar)(*src++ & 0x1F);
2966 } else {
2967 --src;
2968 thisCh = '\\';
2969 }
2970 break;
2971 case 'x':
2972 nDigits = 2;
2973 goto lexHex;
2974 case 'u':
2975 nDigits = 4;
2976 lexHex:
2977 n = 0;
2978 for (i = 0; (i < nDigits) && (src < end); i++) {
2979 uintN digit;
2980 c = *src++;
2981 if (!isASCIIHexDigit(c, &digit)) {
2982 /*
2983 * Back off to accepting the original '\'
2984 * as a literal
2985 */
2986 src -= i + 1;
2987 n = '\\';
2988 break;
2989 }
2990 n = (n << 4) | digit;
2991 }
2992 thisCh = (jschar)n;
2993 break;
2994 case '0':
2995 case '1':
2996 case '2':
2997 case '3':
2998 case '4':
2999 case '5':
3000 case '6':
3001 case '7':
3002 /*
3003 * This is a non-ECMA extension - decimal escapes (in this
3004 * case, octal!) are supposed to be an error inside class
3005 * ranges, but supported here for backwards compatibility.
3006 */
3007 n = JS7_UNDEC(c);
3008 c = *src;
3009 if ('0' <= c && c <= '7') {
3010 src++;
3011 n = 8 * n + JS7_UNDEC(c);
3012 c = *src;
3013 if ('0' <= c && c <= '7') {
3014 src++;
3015 i = 8 * n + JS7_UNDEC(c);
3016 if (i <= 0377)
3017 n = i;
3018 else
3019 src--;
3020 }
3021 }
3022 thisCh = (jschar)n;
3023 break;
3024
3025 case 'd':
3026 AddCharacterRangeToCharSet(charSet, '0', '9');
3027 continue; /* don't need range processing */
3028 case 'D':
3029 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
3030 AddCharacterRangeToCharSet(charSet,
3031 (jschar)('9' + 1),
3032 (jschar)charSet->length);
3033 continue;
3034 case 's':
3035 AddCharacterRanges(charSet, WhiteSpaceRanges,
3036 WhiteSpaceRanges + JS_ARRAY_LENGTH(WhiteSpaceRanges));
3037 continue;
3038 case 'S':
3039 AddInvertedCharacterRanges(charSet, WhiteSpaceRanges,
3040 WhiteSpaceRanges + JS_ARRAY_LENGTH(WhiteSpaceRanges));
3041 continue;
3042 case 'w':
3043 AddCharacterRanges(charSet, WordRanges,
3044 WordRanges + JS_ARRAY_LENGTH(WordRanges));
3045 continue;
3046 case 'W':
3047 AddInvertedCharacterRanges(charSet, WordRanges,
3048 WordRanges + JS_ARRAY_LENGTH(WordRanges));
3049 continue;
3050 default:
3051 thisCh = c;
3052 break;
3053
3054 }
3055 break;
3056
3057 default:
3058 thisCh = *src++;
3059 break;
3060
3061 }
3062 if (inRange) {
3063 if (re->flags & JSREG_FOLD) {
3064 int i;
3065
3066 JS_ASSERT(rangeStart <= thisCh);
3067 for (i = rangeStart; i <= thisCh; i++) {
3068 jschar uch, dch;
3069
3070 AddCharacterToCharSet(charSet, i);
3071 uch = upcase(i);
3072 dch = downcase(i);
3073 if (i != uch)
3074 AddCharacterToCharSet(charSet, uch);
3075 if (i != dch)
3076 AddCharacterToCharSet(charSet, dch);
3077 }
3078 } else {
3079 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
3080 }
3081 inRange = JS_FALSE;
3082 } else {
3083 if (re->flags & JSREG_FOLD) {
3084 AddCharacterToCharSet(charSet, upcase(thisCh));
3085 AddCharacterToCharSet(charSet, downcase(thisCh));
3086 } else {
3087 AddCharacterToCharSet(charSet, thisCh);
3088 }
3089 if (src < end - 1) {
3090 if (*src == '-') {
3091 ++src;
3092 inRange = JS_TRUE;
3093 rangeStart = thisCh;
3094 }
3095 }
3096 }
3097 }
3098 return JS_TRUE;
3099 }
3100
3101 static inline JSBool
3102 MatcherProcessCharSet(REGlobalData *gData, RECharSet *charSet) {
3103 JSBool rv = ProcessCharSet(gData->cx, gData->regexp, charSet);
3104 if (!rv) gData->ok = JS_FALSE;
3105 return rv;
3106 }
3107
3108 void
3109 js_DestroyRegExp(JSContext *cx, JSRegExp *re)
3110 {
3111 if (JS_ATOMIC_DECREMENT(&re->nrefs) == 0) {
3112 if (re->classList) {
3113 uintN i;
3114 for (i = 0; i < re->classCount; i++) {
3115 if (re->classList[i].converted)
3116 JS_free(cx, re->classList[i].u.bits);
3117 re->classList[i].u.bits = NULL;
3118 }
3119 JS_free(cx, re->classList);
3120 }
3121 JS_free(cx, re);
3122 }
3123 }
3124
3125 static JSBool
3126 ReallocStateStack(REGlobalData *gData)
3127 {
3128 size_t limit = gData->stateStackLimit;
3129 size_t sz = sizeof(REProgState) * limit;
3130
3131 JS_ARENA_GROW_CAST(gData->stateStack, REProgState *,
3132 &gData->cx->regexpPool, sz, sz);
3133 if (!gData->stateStack) {
3134 js_ReportOutOfScriptQuota(gData->cx);
3135 gData->ok = JS_FALSE;
3136 return JS_FALSE;
3137 }
3138 gData->stateStackLimit = limit + limit;
3139 return JS_TRUE;
3140 }
3141
3142 #define PUSH_STATE_STACK(data) \
3143 JS_BEGIN_MACRO \
3144 ++(data)->stateStackTop; \
3145 if ((data)->stateStackTop == (data)->stateStackLimit && \
3146 !ReallocStateStack((data))) { \
3147 return NULL; \
3148 } \
3149 JS_END_MACRO
3150
3151 /*
3152 * Apply the current op against the given input to see if it's going to match
3153 * or fail. Return false if we don't get a match, true if we do. If updatecp is
3154 * true, then update the current state's cp. Always update startpc to the next
3155 * op.
3156 */
3157 static JS_ALWAYS_INLINE REMatchState *
3158 SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
3159 jsbytecode **startpc, JSBool updatecp)
3160 {
3161 REMatchState *result = NULL;
3162 jschar matchCh;
3163 size_t parenIndex;
3164 size_t offset, length, index;
3165 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
3166 jschar *source;
3167 const jschar *startcp = x->cp;
3168 jschar ch;
3169 RECharSet *charSet;
3170
3171 #ifdef REGEXP_DEBUG
3172 const char *opname = reop_names[op];
3173 re_debug("\n%06d: %*s%s", pc - gData->regexp->program,
3174 gData->stateStackTop * 2, "", opname);
3175 #endif
3176 switch (op) {
3177 case REOP_EMPTY:
3178 result = x;
3179 break;
3180 case REOP_BOL:
3181 if (x->cp != gData->cpbegin) {
3182 if (!gData->cx->regExpStatics.multiline &&
3183 !(gData->regexp->flags & JSREG_MULTILINE)) {
3184 break;
3185 }
3186 if (!RE_IS_LINE_TERM(x->cp[-1]))
3187 break;
3188 }
3189 result = x;
3190 break;
3191 case REOP_EOL:
3192 if (x->cp != gData->cpend) {
3193 if (!gData->cx->regExpStatics.multiline &&
3194 !(gData->regexp->flags & JSREG_MULTILINE)) {
3195 break;
3196 }
3197 if (!RE_IS_LINE_TERM(*x->cp))
3198 break;
3199 }
3200 result = x;
3201 break;
3202 case REOP_WBDRY:
3203 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
3204 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
3205 result = x;
3206 }
3207 break;
3208 case REOP_WNONBDRY:
3209 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
3210 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
3211 result = x;
3212 }
3213 break;
3214 case REOP_DOT:
3215 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(