/[jscoverage]/trunk/js/jsregexp.cpp
ViewVC logotype

Contents of /trunk/js/jsregexp.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 507 - (show annotations)
Sun Jan 10 07:23:34 2010 UTC (9 years, 5 months ago) by siliconforks
File size: 199577 byte(s)
Update SpiderMonkey from Firefox 3.6rc1.

1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 * vim: set sw=4 ts=8 et tw=78:
3 *
4 * ***** BEGIN LICENSE BLOCK *****
5 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 *
7 * The contents of this file are subject to the Mozilla Public License Version
8 * 1.1 (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 * http://www.mozilla.org/MPL/
11 *
12 * Software distributed under the License is distributed on an "AS IS" basis,
13 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
14 * for the specific language governing rights and limitations under the
15 * License.
16 *
17 * The Original Code is Mozilla Communicator client code, released
18 * March 31, 1998.
19 *
20 * The Initial Developer of the Original Code is
21 * Netscape Communications Corporation.
22 * Portions created by the Initial Developer are Copyright (C) 1998
23 * the Initial Developer. All Rights Reserved.
24 *
25 * Contributor(s):
26 *
27 * Alternatively, the contents of this file may be used under the terms of
28 * either of the GNU General Public License Version 2 or later (the "GPL"),
29 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 * in which case the provisions of the GPL or the LGPL are applicable instead
31 * of those above. If you wish to allow use of your version of this file only
32 * under the terms of either the GPL or the LGPL, and not to allow others to
33 * use your version of this file under the terms of the MPL, indicate your
34 * decision by deleting the provisions above and replace them with the notice
35 * and other provisions required by the GPL or the LGPL. If you do not delete
36 * the provisions above, a recipient may use your version of this file under
37 * the terms of any one of the MPL, the GPL or the LGPL.
38 *
39 * ***** END LICENSE BLOCK ***** */
40
41 /*
42 * JS regular expressions, after Perl.
43 */
44 #include <stdlib.h>
45 #include <string.h>
46 #include <stdarg.h>
47 #include "jstypes.h"
48 #include "jsstdint.h"
49 #include "jsarena.h" /* Added by JSIFY */
50 #include "jsutil.h" /* Added by JSIFY */
51 #include "jsapi.h"
52 #include "jsarray.h"
53 #include "jsatom.h"
54 #include "jsbuiltins.h"
55 #include "jscntxt.h"
56 #include "jsversion.h"
57 #include "jsfun.h"
58 #include "jsgc.h"
59 #include "jsinterp.h"
60 #include "jslock.h"
61 #include "jsnum.h"
62 #include "jsobj.h"
63 #include "jsopcode.h"
64 #include "jsregexp.h"
65 #include "jsscan.h"
66 #include "jsscope.h"
67 #include "jsstaticcheck.h"
68 #include "jsstr.h"
69 #include "jsvector.h"
70
71 #include <algorithm>
72
73 #ifdef JS_TRACER
74 #include "jstracer.h"
75 using namespace avmplus;
76 using namespace nanojit;
77 #endif
78
79 typedef enum REOp {
80 #define REOP_DEF(opcode, name) opcode,
81 #include "jsreops.tbl"
82 #undef REOP_DEF
83 REOP_LIMIT /* META: no operator >= to this */
84 } REOp;
85
86 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
87
88 #ifdef REGEXP_DEBUG
89 const char *reop_names[] = {
90 #define REOP_DEF(opcode, name) name,
91 #include "jsreops.tbl"
92 #undef REOP_DEF
93 NULL
94 };
95 #endif
96
97 #ifdef __GNUC__
98 static int
99 re_debug(const char *fmt, ...) __attribute__ ((format(printf, 1, 2)));
100 #endif
101
102 #ifdef REGEXP_DEBUG
103 static int
104 re_debug(const char *fmt, ...)
105 {
106 va_list ap;
107 int retval;
108
109 va_start(ap, fmt);
110 retval = vprintf(fmt, ap);
111 va_end(ap);
112 return retval;
113 }
114
115 static void
116 re_debug_chars(const jschar *chrs, size_t length)
117 {
118 int i = 0;
119
120 printf(" \"");
121 while (*chrs && i++ < length) {
122 putchar((char)*chrs++);
123 }
124 printf("\"");
125 }
126 #else /* !REGEXP_DEBUG */
127 /* This should be optimized to a no-op by our tier-1 compilers. */
128 static int
129 re_debug(const char *fmt, ...)
130 {
131 return 0;
132 }
133
134 static void
135 re_debug_chars(const jschar *chrs, size_t length)
136 {
137 }
138 #endif /* !REGEXP_DEBUG */
139
140 struct RENode {
141 REOp op; /* r.e. op bytecode */
142 RENode *next; /* next in concatenation order */
143 void *kid; /* first operand */
144 union {
145 void *kid2; /* second operand */
146 jsint num; /* could be a number */
147 size_t parenIndex; /* or a parenthesis index */
148 struct { /* or a quantifier range */
149 uintN min;
150 uintN max;
151 JSPackedBool greedy;
152 } range;
153 struct { /* or a character class */
154 size_t startIndex;
155 size_t kidlen; /* length of string at kid, in jschars */
156 size_t index; /* index into class list */
157 uint16 bmsize; /* bitmap size, based on max char code */
158 JSPackedBool sense;
159 } ucclass;
160 struct { /* or a literal sequence */
161 jschar chr; /* of one character */
162 size_t length; /* or many (via the kid) */
163 } flat;
164 struct {
165 RENode *kid2; /* second operand from ALT */
166 jschar ch1; /* match char for ALTPREREQ */
167 jschar ch2; /* ditto, or class index for ALTPREREQ2 */
168 } altprereq;
169 } u;
170 };
171
172 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
173 ((c >= 'a') && (c <= 'z')) )
174 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
175 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
176
177 #define CLASS_CACHE_SIZE 4
178
179 typedef struct CompilerState {
180 JSContext *context;
181 JSTokenStream *tokenStream; /* For reporting errors */
182 const jschar *cpbegin;
183 const jschar *cpend;
184 const jschar *cp;
185 size_t parenCount;
186 size_t classCount; /* number of [] encountered */
187 size_t treeDepth; /* maximum depth of parse tree */
188 size_t progLength; /* estimated bytecode length */
189 RENode *result;
190 size_t classBitmapsMem; /* memory to hold all class bitmaps */
191 struct {
192 const jschar *start; /* small cache of class strings */
193 size_t length; /* since they're often the same */
194 size_t index;
195 } classCache[CLASS_CACHE_SIZE];
196 uint16 flags;
197 } CompilerState;
198
199 typedef struct EmitStateStackEntry {
200 jsbytecode *altHead; /* start of REOP_ALT* opcode */
201 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
202 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
203 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
204 RENode *continueNode; /* original REOP_ALT* node being stacked */
205 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
206 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
207 avoid 16-bit unsigned offset overflow */
208 } EmitStateStackEntry;
209
210 /*
211 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
212 * the getters and setters take the pc of the offset, not of the opcode before
213 * the offset.
214 */
215 #define ARG_LEN 2
216 #define GET_ARG(pc) ((uint16)(((pc)[0] << 8) | (pc)[1]))
217 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
218 (pc)[1] = (jsbytecode) (arg))
219
220 #define OFFSET_LEN ARG_LEN
221 #define OFFSET_MAX (JS_BIT(ARG_LEN * 8) - 1)
222 #define GET_OFFSET(pc) GET_ARG(pc)
223
224 /*
225 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
226 * For sanity, we limit it to 2^24 bytes.
227 */
228 #define TREE_DEPTH_MAX (JS_BIT(24) / sizeof(EmitStateStackEntry))
229
230 /*
231 * The maximum memory that can be allocated for class bitmaps.
232 * For sanity, we limit it to 2^24 bytes.
233 */
234 #define CLASS_BITMAPS_MEM_LIMIT JS_BIT(24)
235
236 /*
237 * Functions to get size and write/read bytecode that represent small indexes
238 * compactly.
239 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
240 * indicates that the following byte brings more bits to the index. Otherwise
241 * this is the last byte in the index bytecode representing highest index bits.
242 */
243 static size_t
244 GetCompactIndexWidth(size_t index)
245 {
246 size_t width;
247
248 for (width = 1; (index >>= 7) != 0; ++width) { }
249 return width;
250 }
251
252 static JS_ALWAYS_INLINE jsbytecode *
253 WriteCompactIndex(jsbytecode *pc, size_t index)
254 {
255 size_t next;
256
257 while ((next = index >> 7) != 0) {
258 *pc++ = (jsbytecode)(index | 0x80);
259 index = next;
260 }
261 *pc++ = (jsbytecode)index;
262 return pc;
263 }
264
265 static JS_ALWAYS_INLINE jsbytecode *
266 ReadCompactIndex(jsbytecode *pc, size_t *result)
267 {
268 size_t nextByte;
269
270 nextByte = *pc++;
271 if ((nextByte & 0x80) == 0) {
272 /*
273 * Short-circuit the most common case when compact index <= 127.
274 */
275 *result = nextByte;
276 } else {
277 size_t shift = 7;
278 *result = 0x7F & nextByte;
279 do {
280 nextByte = *pc++;
281 *result |= (nextByte & 0x7F) << shift;
282 shift += 7;
283 } while ((nextByte & 0x80) != 0);
284 }
285 return pc;
286 }
287
288 typedef struct RECapture {
289 ptrdiff_t index; /* start of contents, -1 for empty */
290 size_t length; /* length of capture */
291 } RECapture;
292
293 typedef struct REMatchState {
294 const jschar *cp;
295 RECapture parens[1]; /* first of 're->parenCount' captures,
296 allocated at end of this struct */
297 } REMatchState;
298
299 struct REBackTrackData;
300
301 typedef struct REProgState {
302 jsbytecode *continue_pc; /* current continuation data */
303 jsbytecode continue_op;
304 ptrdiff_t index; /* progress in text */
305 size_t parenSoFar; /* highest indexed paren started */
306 union {
307 struct {
308 uintN min; /* current quantifier limits */
309 uintN max;
310 } quantifier;
311 struct {
312 size_t top; /* backtrack stack state */
313 size_t sz;
314 } assertion;
315 } u;
316 } REProgState;
317
318 typedef struct REBackTrackData {
319 size_t sz; /* size of previous stack entry */
320 jsbytecode *backtrack_pc; /* where to backtrack to */
321 jsbytecode backtrack_op;
322 const jschar *cp; /* index in text of match at backtrack */
323 size_t parenIndex; /* start index of saved paren contents */
324 size_t parenCount; /* # of saved paren contents */
325 size_t saveStateStackTop; /* number of parent states */
326 /* saved parent states follow */
327 /* saved paren contents follow */
328 } REBackTrackData;
329
330 #define INITIAL_STATESTACK 100
331 #define INITIAL_BACKTRACK 8000
332
333 typedef struct REGlobalData {
334 JSContext *cx;
335 JSRegExp *regexp; /* the RE in execution */
336 JSBool ok; /* runtime error (out_of_memory only?) */
337 size_t start; /* offset to start at */
338 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
339 const jschar *cpbegin; /* text base address */
340 const jschar *cpend; /* text limit address */
341
342 REProgState *stateStack; /* stack of state of current parents */
343 size_t stateStackTop;
344 size_t stateStackLimit;
345
346 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
347 REBackTrackData *backTrackSP;
348 size_t backTrackStackSize;
349 size_t cursz; /* size of current stack entry */
350 size_t backTrackCount; /* how many times we've backtracked */
351 size_t backTrackLimit; /* upper limit on backtrack states */
352 } REGlobalData;
353
354 /*
355 * 1. If IgnoreCase is false, return ch.
356 * 2. Let u be ch converted to upper case as if by calling
357 * String.prototype.toUpperCase on the one-character string ch.
358 * 3. If u does not consist of a single character, return ch.
359 * 4. Let cu be u's character.
360 * 5. If ch's code point value is greater than or equal to decimal 128 and cu's
361 * code point value is less than decimal 128, then return ch.
362 * 6. Return cu.
363 */
364 static JS_ALWAYS_INLINE uintN
365 upcase(uintN ch)
366 {
367 uintN cu;
368
369 JS_ASSERT((uintN) (jschar) ch == ch);
370 if (ch < 128) {
371 if (ch - (uintN) 'a' <= (uintN) ('z' - 'a'))
372 ch -= (uintN) ('a' - 'A');
373 return ch;
374 }
375
376 cu = JS_TOUPPER(ch);
377 return (cu < 128) ? ch : cu;
378 }
379
380 static JS_ALWAYS_INLINE uintN
381 downcase(uintN ch)
382 {
383 JS_ASSERT((uintN) (jschar) ch == ch);
384 if (ch < 128) {
385 if (ch - (uintN) 'A' <= (uintN) ('Z' - 'A'))
386 ch += (uintN) ('a' - 'A');
387 return ch;
388 }
389
390 return JS_TOLOWER(ch);
391 }
392
393 /* Construct and initialize an RENode, returning NULL for out-of-memory */
394 static RENode *
395 NewRENode(CompilerState *state, REOp op)
396 {
397 JSContext *cx;
398 RENode *ren;
399
400 cx = state->context;
401 JS_ARENA_ALLOCATE_CAST(ren, RENode *, &cx->tempPool, sizeof *ren);
402 if (!ren) {
403 js_ReportOutOfScriptQuota(cx);
404 return NULL;
405 }
406 ren->op = op;
407 ren->next = NULL;
408 ren->kid = NULL;
409 return ren;
410 }
411
412 /*
413 * Validates and converts hex ascii value.
414 */
415 static JSBool
416 isASCIIHexDigit(jschar c, uintN *digit)
417 {
418 uintN cv = c;
419
420 if (cv < '0')
421 return JS_FALSE;
422 if (cv <= '9') {
423 *digit = cv - '0';
424 return JS_TRUE;
425 }
426 cv |= 0x20;
427 if (cv >= 'a' && cv <= 'f') {
428 *digit = cv - 'a' + 10;
429 return JS_TRUE;
430 }
431 return JS_FALSE;
432 }
433
434
435 typedef struct {
436 REOp op;
437 const jschar *errPos;
438 size_t parenIndex;
439 } REOpData;
440
441 static JSBool
442 ReportRegExpErrorHelper(CompilerState *state, uintN flags, uintN errorNumber,
443 const jschar *arg)
444 {
445 if (state->tokenStream) {
446 return js_ReportCompileErrorNumber(state->context, state->tokenStream,
447 NULL, JSREPORT_UC | flags,
448 errorNumber, arg);
449 }
450 return JS_ReportErrorFlagsAndNumberUC(state->context, flags,
451 js_GetErrorMessage, NULL,
452 errorNumber, arg);
453 }
454
455 static JSBool
456 ReportRegExpError(CompilerState *state, uintN flags, uintN errorNumber)
457 {
458 return ReportRegExpErrorHelper(state, flags, errorNumber, NULL);
459 }
460
461 /*
462 * Process the op against the two top operands, reducing them to a single
463 * operand in the penultimate slot. Update progLength and treeDepth.
464 */
465 static JSBool
466 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
467 intN operandSP)
468 {
469 RENode *result;
470
471 switch (opData->op) {
472 case REOP_ALT:
473 result = NewRENode(state, REOP_ALT);
474 if (!result)
475 return JS_FALSE;
476 result->kid = operandStack[operandSP - 2];
477 result->u.kid2 = operandStack[operandSP - 1];
478 operandStack[operandSP - 2] = result;
479
480 if (state->treeDepth == TREE_DEPTH_MAX) {
481 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
482 return JS_FALSE;
483 }
484 ++state->treeDepth;
485
486 /*
487 * Look at both alternates to see if there's a FLAT or a CLASS at
488 * the start of each. If so, use a prerequisite match.
489 */
490 if (((RENode *) result->kid)->op == REOP_FLAT &&
491 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
492 (state->flags & JSREG_FOLD) == 0) {
493 result->op = REOP_ALTPREREQ;
494 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
495 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
496 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
497 JUMP, <end> ... ENDALT */
498 state->progLength += 13;
499 }
500 else
501 if (((RENode *) result->kid)->op == REOP_CLASS &&
502 ((RENode *) result->kid)->u.ucclass.index < 256 &&
503 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
504 (state->flags & JSREG_FOLD) == 0) {
505 result->op = REOP_ALTPREREQ2;
506 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
507 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
508 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
509 JUMP, <end> ... ENDALT */
510 state->progLength += 13;
511 }
512 else
513 if (((RENode *) result->kid)->op == REOP_FLAT &&
514 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
515 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
516 (state->flags & JSREG_FOLD) == 0) {
517 result->op = REOP_ALTPREREQ2;
518 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
519 result->u.altprereq.ch2 =
520 ((RENode *) result->u.kid2)->u.ucclass.index;
521 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
522 JUMP, <end> ... ENDALT */
523 state->progLength += 13;
524 }
525 else {
526 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
527 state->progLength += 7;
528 }
529 break;
530
531 case REOP_CONCAT:
532 result = operandStack[operandSP - 2];
533 while (result->next)
534 result = result->next;
535 result->next = operandStack[operandSP - 1];
536 break;
537
538 case REOP_ASSERT:
539 case REOP_ASSERT_NOT:
540 case REOP_LPARENNON:
541 case REOP_LPAREN:
542 /* These should have been processed by a close paren. */
543 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
544 opData->errPos);
545 return JS_FALSE;
546
547 default:;
548 }
549 return JS_TRUE;
550 }
551
552 /*
553 * Parser forward declarations.
554 */
555 static JSBool ParseTerm(CompilerState *state);
556 static JSBool ParseQuantifier(CompilerState *state);
557 static intN ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues);
558
559 /*
560 * Top-down regular expression grammar, based closely on Perl4.
561 *
562 * regexp: altern A regular expression is one or more
563 * altern '|' regexp alternatives separated by vertical bar.
564 */
565 #define INITIAL_STACK_SIZE 128
566
567 static JSBool
568 ParseRegExp(CompilerState *state)
569 {
570 size_t parenIndex;
571 RENode *operand;
572 REOpData *operatorStack;
573 RENode **operandStack;
574 REOp op;
575 intN i;
576 JSBool result = JS_FALSE;
577
578 intN operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
579 intN operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
580
581 /* Watch out for empty regexp */
582 if (state->cp == state->cpend) {
583 state->result = NewRENode(state, REOP_EMPTY);
584 return (state->result != NULL);
585 }
586
587 operatorStack = (REOpData *)
588 state->context->malloc(sizeof(REOpData) * operatorStackSize);
589 if (!operatorStack)
590 return JS_FALSE;
591
592 operandStack = (RENode **)
593 state->context->malloc(sizeof(RENode *) * operandStackSize);
594 if (!operandStack)
595 goto out;
596
597 for (;;) {
598 parenIndex = state->parenCount;
599 if (state->cp == state->cpend) {
600 /*
601 * If we are at the end of the regexp and we're short one or more
602 * operands, the regexp must have the form /x|/ or some such, with
603 * left parentheses making us short more than one operand.
604 */
605 if (operatorSP >= operandSP) {
606 operand = NewRENode(state, REOP_EMPTY);
607 if (!operand)
608 goto out;
609 goto pushOperand;
610 }
611 } else {
612 switch (*state->cp) {
613 case '(':
614 ++state->cp;
615 if (state->cp + 1 < state->cpend &&
616 *state->cp == '?' &&
617 (state->cp[1] == '=' ||
618 state->cp[1] == '!' ||
619 state->cp[1] == ':')) {
620 switch (state->cp[1]) {
621 case '=':
622 op = REOP_ASSERT;
623 /* ASSERT, <next>, ... ASSERTTEST */
624 state->progLength += 4;
625 break;
626 case '!':
627 op = REOP_ASSERT_NOT;
628 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
629 state->progLength += 4;
630 break;
631 default:
632 op = REOP_LPARENNON;
633 break;
634 }
635 state->cp += 2;
636 } else {
637 op = REOP_LPAREN;
638 /* LPAREN, <index>, ... RPAREN, <index> */
639 state->progLength
640 += 2 * (1 + GetCompactIndexWidth(parenIndex));
641 state->parenCount++;
642 if (state->parenCount == 65535) {
643 ReportRegExpError(state, JSREPORT_ERROR,
644 JSMSG_TOO_MANY_PARENS);
645 goto out;
646 }
647 }
648 goto pushOperator;
649
650 case ')':
651 /*
652 * If there's no stacked open parenthesis, throw syntax error.
653 */
654 for (i = operatorSP - 1; ; i--) {
655 if (i < 0) {
656 ReportRegExpError(state, JSREPORT_ERROR,
657 JSMSG_UNMATCHED_RIGHT_PAREN);
658 goto out;
659 }
660 if (operatorStack[i].op == REOP_ASSERT ||
661 operatorStack[i].op == REOP_ASSERT_NOT ||
662 operatorStack[i].op == REOP_LPARENNON ||
663 operatorStack[i].op == REOP_LPAREN) {
664 break;
665 }
666 }
667 /* FALL THROUGH */
668
669 case '|':
670 /* Expected an operand before these, so make an empty one */
671 operand = NewRENode(state, REOP_EMPTY);
672 if (!operand)
673 goto out;
674 goto pushOperand;
675
676 default:
677 if (!ParseTerm(state))
678 goto out;
679 operand = state->result;
680 pushOperand:
681 if (operandSP == operandStackSize) {
682 RENode **tmp;
683 operandStackSize += operandStackSize;
684 tmp = (RENode **)
685 state->context->realloc(operandStack,
686 sizeof(RENode *) * operandStackSize);
687 if (!tmp)
688 goto out;
689 operandStack = tmp;
690 }
691 operandStack[operandSP++] = operand;
692 break;
693 }
694 }
695
696 /* At the end; process remaining operators. */
697 restartOperator:
698 if (state->cp == state->cpend) {
699 while (operatorSP) {
700 --operatorSP;
701 if (!ProcessOp(state, &operatorStack[operatorSP],
702 operandStack, operandSP))
703 goto out;
704 --operandSP;
705 }
706 JS_ASSERT(operandSP == 1);
707 state->result = operandStack[0];
708 result = JS_TRUE;
709 goto out;
710 }
711
712 switch (*state->cp) {
713 case '|':
714 /* Process any stacked 'concat' operators */
715 ++state->cp;
716 while (operatorSP &&
717 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
718 --operatorSP;
719 if (!ProcessOp(state, &operatorStack[operatorSP],
720 operandStack, operandSP)) {
721 goto out;
722 }
723 --operandSP;
724 }
725 op = REOP_ALT;
726 goto pushOperator;
727
728 case ')':
729 /*
730 * If there's no stacked open parenthesis, throw syntax error.
731 */
732 for (i = operatorSP - 1; ; i--) {
733 if (i < 0) {
734 ReportRegExpError(state, JSREPORT_ERROR,
735 JSMSG_UNMATCHED_RIGHT_PAREN);
736 goto out;
737 }
738 if (operatorStack[i].op == REOP_ASSERT ||
739 operatorStack[i].op == REOP_ASSERT_NOT ||
740 operatorStack[i].op == REOP_LPARENNON ||
741 operatorStack[i].op == REOP_LPAREN) {
742 break;
743 }
744 }
745 ++state->cp;
746
747 /* Process everything on the stack until the open parenthesis. */
748 for (;;) {
749 JS_ASSERT(operatorSP);
750 --operatorSP;
751 switch (operatorStack[operatorSP].op) {
752 case REOP_ASSERT:
753 case REOP_ASSERT_NOT:
754 case REOP_LPAREN:
755 operand = NewRENode(state, operatorStack[operatorSP].op);
756 if (!operand)
757 goto out;
758 operand->u.parenIndex =
759 operatorStack[operatorSP].parenIndex;
760 JS_ASSERT(operandSP);
761 operand->kid = operandStack[operandSP - 1];
762 operandStack[operandSP - 1] = operand;
763 if (state->treeDepth == TREE_DEPTH_MAX) {
764 ReportRegExpError(state, JSREPORT_ERROR,
765 JSMSG_REGEXP_TOO_COMPLEX);
766 goto out;
767 }
768 ++state->treeDepth;
769 /* FALL THROUGH */
770
771 case REOP_LPARENNON:
772 state->result = operandStack[operandSP - 1];
773 if (!ParseQuantifier(state))
774 goto out;
775 operandStack[operandSP - 1] = state->result;
776 goto restartOperator;
777 default:
778 if (!ProcessOp(state, &operatorStack[operatorSP],
779 operandStack, operandSP))
780 goto out;
781 --operandSP;
782 break;
783 }
784 }
785 break;
786
787 case '{':
788 {
789 const jschar *errp = state->cp;
790
791 if (ParseMinMaxQuantifier(state, JS_TRUE) < 0) {
792 /*
793 * This didn't even scan correctly as a quantifier, so we should
794 * treat it as flat.
795 */
796 op = REOP_CONCAT;
797 goto pushOperator;
798 }
799
800 state->cp = errp;
801 /* FALL THROUGH */
802 }
803
804 case '+':
805 case '*':
806 case '?':
807 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
808 state->cp);
809 result = JS_FALSE;
810 goto out;
811
812 default:
813 /* Anything else is the start of the next term. */
814 op = REOP_CONCAT;
815 pushOperator:
816 if (operatorSP == operatorStackSize) {
817 REOpData *tmp;
818 operatorStackSize += operatorStackSize;
819 tmp = (REOpData *)
820 state->context->realloc(operatorStack,
821 sizeof(REOpData) * operatorStackSize);
822 if (!tmp)
823 goto out;
824 operatorStack = tmp;
825 }
826 operatorStack[operatorSP].op = op;
827 operatorStack[operatorSP].errPos = state->cp;
828 operatorStack[operatorSP++].parenIndex = parenIndex;
829 break;
830 }
831 }
832 out:
833 if (operatorStack)
834 state->context->free(operatorStack);
835 if (operandStack)
836 state->context->free(operandStack);
837 return result;
838 }
839
840 /*
841 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
842 * its being on the stack, and to propagate errors to its callers.
843 */
844 #define JSREG_FIND_PAREN_COUNT 0x8000
845 #define JSREG_FIND_PAREN_ERROR 0x4000
846
847 /*
848 * Magic return value from FindParenCount and GetDecimalValue, to indicate
849 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
850 * its findMax parameter is non-null.
851 */
852 #define OVERFLOW_VALUE ((uintN)-1)
853
854 static uintN
855 FindParenCount(CompilerState *state)
856 {
857 CompilerState temp;
858 int i;
859
860 if (state->flags & JSREG_FIND_PAREN_COUNT)
861 return OVERFLOW_VALUE;
862
863 /*
864 * Copy state into temp, flag it so we never report an invalid backref,
865 * and reset its members to parse the entire regexp. This is obviously
866 * suboptimal, but GetDecimalValue calls us only if a backref appears to
867 * refer to a forward parenthetical, which is rare.
868 */
869 temp = *state;
870 temp.flags |= JSREG_FIND_PAREN_COUNT;
871 temp.cp = temp.cpbegin;
872 temp.parenCount = 0;
873 temp.classCount = 0;
874 temp.progLength = 0;
875 temp.treeDepth = 0;
876 temp.classBitmapsMem = 0;
877 for (i = 0; i < CLASS_CACHE_SIZE; i++)
878 temp.classCache[i].start = NULL;
879
880 if (!ParseRegExp(&temp)) {
881 state->flags |= JSREG_FIND_PAREN_ERROR;
882 return OVERFLOW_VALUE;
883 }
884 return temp.parenCount;
885 }
886
887 /*
888 * Extract and return a decimal value at state->cp. The initial character c
889 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
890 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
891 * state->flags to discover whether an error occurred under findMax.
892 */
893 static uintN
894 GetDecimalValue(jschar c, uintN max, uintN (*findMax)(CompilerState *state),
895 CompilerState *state)
896 {
897 uintN value = JS7_UNDEC(c);
898 JSBool overflow = (value > max && (!findMax || value > findMax(state)));
899
900 /* The following restriction allows simpler overflow checks. */
901 JS_ASSERT(max <= ((uintN)-1 - 9) / 10);
902 while (state->cp < state->cpend) {
903 c = *state->cp;
904 if (!JS7_ISDEC(c))
905 break;
906 value = 10 * value + JS7_UNDEC(c);
907 if (!overflow && value > max && (!findMax || value > findMax(state)))
908 overflow = JS_TRUE;
909 ++state->cp;
910 }
911 return overflow ? OVERFLOW_VALUE : value;
912 }
913
914 /*
915 * Calculate the total size of the bitmap required for a class expression.
916 */
917 static JSBool
918 CalculateBitmapSize(CompilerState *state, RENode *target, const jschar *src,
919 const jschar *end)
920 {
921 uintN max = 0;
922 JSBool inRange = JS_FALSE;
923 jschar c, rangeStart = 0;
924 uintN n, digit, nDigits, i;
925
926 target->u.ucclass.bmsize = 0;
927 target->u.ucclass.sense = JS_TRUE;
928
929 if (src == end)
930 return JS_TRUE;
931
932 if (*src == '^') {
933 ++src;
934 target->u.ucclass.sense = JS_FALSE;
935 }
936
937 while (src != end) {
938 JSBool canStartRange = JS_TRUE;
939 uintN localMax = 0;
940
941 switch (*src) {
942 case '\\':
943 ++src;
944 c = *src++;
945 switch (c) {
946 case 'b':
947 localMax = 0x8;
948 break;
949 case 'f':
950 localMax = 0xC;
951 break;
952 case 'n':
953 localMax = 0xA;
954 break;
955 case 'r':
956 localMax = 0xD;
957 break;
958 case 't':
959 localMax = 0x9;
960 break;
961 case 'v':
962 localMax = 0xB;
963 break;
964 case 'c':
965 if (src < end && RE_IS_LETTER(*src)) {
966 localMax = (uintN) (*src++) & 0x1F;
967 } else {
968 --src;
969 localMax = '\\';
970 }
971 break;
972 case 'x':
973 nDigits = 2;
974 goto lexHex;
975 case 'u':
976 nDigits = 4;
977 lexHex:
978 n = 0;
979 for (i = 0; (i < nDigits) && (src < end); i++) {
980 c = *src++;
981 if (!isASCIIHexDigit(c, &digit)) {
982 /*
983 * Back off to accepting the original
984 *'\' as a literal.
985 */
986 src -= i + 1;
987 n = '\\';
988 break;
989 }
990 n = (n << 4) | digit;
991 }
992 localMax = n;
993 break;
994 case 'd':
995 canStartRange = JS_FALSE;
996 if (inRange) {
997 JS_ReportErrorNumber(state->context,
998 js_GetErrorMessage, NULL,
999 JSMSG_BAD_CLASS_RANGE);
1000 return JS_FALSE;
1001 }
1002 localMax = '9';
1003 break;
1004 case 'D':
1005 case 's':
1006 case 'S':
1007 case 'w':
1008 case 'W':
1009 canStartRange = JS_FALSE;
1010 if (inRange) {
1011 JS_ReportErrorNumber(state->context,
1012 js_GetErrorMessage, NULL,
1013 JSMSG_BAD_CLASS_RANGE);
1014 return JS_FALSE;
1015 }
1016 max = 65535;
1017
1018 /*
1019 * If this is the start of a range, ensure that it's less than
1020 * the end.
1021 */
1022 localMax = 0;
1023 break;
1024 case '0':
1025 case '1':
1026 case '2':
1027 case '3':
1028 case '4':
1029 case '5':
1030 case '6':
1031 case '7':
1032 /*
1033 * This is a non-ECMA extension - decimal escapes (in this
1034 * case, octal!) are supposed to be an error inside class
1035 * ranges, but supported here for backwards compatibility.
1036 *
1037 */
1038 n = JS7_UNDEC(c);
1039 c = *src;
1040 if ('0' <= c && c <= '7') {
1041 src++;
1042 n = 8 * n + JS7_UNDEC(c);
1043 c = *src;
1044 if ('0' <= c && c <= '7') {
1045 src++;
1046 i = 8 * n + JS7_UNDEC(c);
1047 if (i <= 0377)
1048 n = i;
1049 else
1050 src--;
1051 }
1052 }
1053 localMax = n;
1054 break;
1055
1056 default:
1057 localMax = c;
1058 break;
1059 }
1060 break;
1061 default:
1062 localMax = *src++;
1063 break;
1064 }
1065
1066 if (inRange) {
1067 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1068 if (rangeStart > localMax) {
1069 JS_ReportErrorNumber(state->context,
1070 js_GetErrorMessage, NULL,
1071 JSMSG_BAD_CLASS_RANGE);
1072 return JS_FALSE;
1073 }
1074 inRange = JS_FALSE;
1075 } else {
1076 if (canStartRange && src < end - 1) {
1077 if (*src == '-') {
1078 ++src;
1079 inRange = JS_TRUE;
1080 rangeStart = (jschar)localMax;
1081 continue;
1082 }
1083 }
1084 if (state->flags & JSREG_FOLD)
1085 rangeStart = localMax; /* one run of the uc/dc loop below */
1086 }
1087
1088 if (state->flags & JSREG_FOLD) {
1089 jschar maxch = localMax;
1090
1091 for (i = rangeStart; i <= localMax; i++) {
1092 jschar uch, dch;
1093
1094 uch = upcase(i);
1095 dch = downcase(i);
1096 maxch = JS_MAX(maxch, uch);
1097 maxch = JS_MAX(maxch, dch);
1098 }
1099 localMax = maxch;
1100 }
1101
1102 if (localMax > max)
1103 max = localMax;
1104 }
1105 target->u.ucclass.bmsize = max;
1106 return JS_TRUE;
1107 }
1108
1109 /*
1110 * item: assertion An item is either an assertion or
1111 * quantatom a quantified atom.
1112 *
1113 * assertion: '^' Assertions match beginning of string
1114 * (or line if the class static property
1115 * RegExp.multiline is true).
1116 * '$' End of string (or line if the class
1117 * static property RegExp.multiline is
1118 * true).
1119 * '\b' Word boundary (between \w and \W).
1120 * '\B' Word non-boundary.
1121 *
1122 * quantatom: atom An unquantified atom.
1123 * quantatom '{' n ',' m '}'
1124 * Atom must occur between n and m times.
1125 * quantatom '{' n ',' '}' Atom must occur at least n times.
1126 * quantatom '{' n '}' Atom must occur exactly n times.
1127 * quantatom '*' Zero or more times (same as {0,}).
1128 * quantatom '+' One or more times (same as {1,}).
1129 * quantatom '?' Zero or one time (same as {0,1}).
1130 *
1131 * any of which can be optionally followed by '?' for ungreedy
1132 *
1133 * atom: '(' regexp ')' A parenthesized regexp (what matched
1134 * can be addressed using a backreference,
1135 * see '\' n below).
1136 * '.' Matches any char except '\n'.
1137 * '[' classlist ']' A character class.
1138 * '[' '^' classlist ']' A negated character class.
1139 * '\f' Form Feed.
1140 * '\n' Newline (Line Feed).
1141 * '\r' Carriage Return.
1142 * '\t' Horizontal Tab.
1143 * '\v' Vertical Tab.
1144 * '\d' A digit (same as [0-9]).
1145 * '\D' A non-digit.
1146 * '\w' A word character, [0-9a-z_A-Z].
1147 * '\W' A non-word character.
1148 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1149 * '\S' A non-whitespace character.
1150 * '\' n A backreference to the nth (n decimal
1151 * and positive) parenthesized expression.
1152 * '\' octal An octal escape sequence (octal must be
1153 * two or three digits long, unless it is
1154 * 0 for the null character).
1155 * '\x' hex A hex escape (hex must be two digits).
1156 * '\u' unicode A unicode escape (must be four digits).
1157 * '\c' ctrl A control character, ctrl is a letter.
1158 * '\' literalatomchar Any character except one of the above
1159 * that follow '\' in an atom.
1160 * otheratomchar Any character not first among the other
1161 * atom right-hand sides.
1162 */
1163 static JSBool
1164 ParseTerm(CompilerState *state)
1165 {
1166 jschar c = *state->cp++;
1167 uintN nDigits;
1168 uintN num, tmp, n, i;
1169 const jschar *termStart;
1170
1171 switch (c) {
1172 /* assertions and atoms */
1173 case '^':
1174 state->result = NewRENode(state, REOP_BOL);
1175 if (!state->result)
1176 return JS_FALSE;
1177 state->progLength++;
1178 return JS_TRUE;
1179 case '$':
1180 state->result = NewRENode(state, REOP_EOL);
1181 if (!state->result)
1182 return JS_FALSE;
1183 state->progLength++;
1184 return JS_TRUE;
1185 case '\\':
1186 if (state->cp >= state->cpend) {
1187 /* a trailing '\' is an error */
1188 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1189 return JS_FALSE;
1190 }
1191 c = *state->cp++;
1192 switch (c) {
1193 /* assertion escapes */
1194 case 'b' :
1195 state->result = NewRENode(state, REOP_WBDRY);
1196 if (!state->result)
1197 return JS_FALSE;
1198 state->progLength++;
1199 return JS_TRUE;
1200 case 'B':
1201 state->result = NewRENode(state, REOP_WNONBDRY);
1202 if (!state->result)
1203 return JS_FALSE;
1204 state->progLength++;
1205 return JS_TRUE;
1206 /* Decimal escape */
1207 case '0':
1208 /* Give a strict warning. See also the note below. */
1209 if (!ReportRegExpError(state, JSREPORT_WARNING | JSREPORT_STRICT,
1210 JSMSG_INVALID_BACKREF)) {
1211 return JS_FALSE;
1212 }
1213 doOctal:
1214 num = 0;
1215 while (state->cp < state->cpend) {
1216 c = *state->cp;
1217 if (c < '0' || '7' < c)
1218 break;
1219 state->cp++;
1220 tmp = 8 * num + (uintN)JS7_UNDEC(c);
1221 if (tmp > 0377)
1222 break;
1223 num = tmp;
1224 }
1225 c = (jschar)num;
1226 doFlat:
1227 state->result = NewRENode(state, REOP_FLAT);
1228 if (!state->result)
1229 return JS_FALSE;
1230 state->result->u.flat.chr = c;
1231 state->result->u.flat.length = 1;
1232 state->progLength += 3;
1233 break;
1234 case '1':
1235 case '2':
1236 case '3':
1237 case '4':
1238 case '5':
1239 case '6':
1240 case '7':
1241 case '8':
1242 case '9':
1243 termStart = state->cp - 1;
1244 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1245 if (state->flags & JSREG_FIND_PAREN_ERROR)
1246 return JS_FALSE;
1247 if (num == OVERFLOW_VALUE) {
1248 /* Give a strict mode warning. */
1249 if (!ReportRegExpError(state,
1250 JSREPORT_WARNING | JSREPORT_STRICT,
1251 (c >= '8')
1252 ? JSMSG_INVALID_BACKREF
1253 : JSMSG_BAD_BACKREF)) {
1254 return JS_FALSE;
1255 }
1256
1257 /*
1258 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1259 * error here. However, for compatibility with IE, we treat the
1260 * whole backref as flat if the first character in it is not a
1261 * valid octal character, and as an octal escape otherwise.
1262 */
1263 state->cp = termStart;
1264 if (c >= '8') {
1265 /* Treat this as flat. termStart - 1 is the \. */
1266 c = '\\';
1267 goto asFlat;
1268 }
1269
1270 /* Treat this as an octal escape. */
1271 goto doOctal;
1272 }
1273
1274 /*
1275 * When FindParenCount calls the regex parser recursively (to find
1276 * the number of backrefs) num can be arbitrary and the maximum
1277 * supported number of backrefs does not bound it.
1278 */
1279 JS_ASSERT_IF(!(state->flags & JSREG_FIND_PAREN_COUNT),
1280 1 <= num && num <= 0x10000);
1281 state->result = NewRENode(state, REOP_BACKREF);
1282 if (!state->result)
1283 return JS_FALSE;
1284 state->result->u.parenIndex = num - 1;
1285 state->progLength
1286 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1287 break;
1288 /* Control escape */
1289 case 'f':
1290 c = 0xC;
1291 goto doFlat;
1292 case 'n':
1293 c = 0xA;
1294 goto doFlat;
1295 case 'r':
1296 c = 0xD;
1297 goto doFlat;
1298 case 't':
1299 c = 0x9;
1300 goto doFlat;
1301 case 'v':
1302 c = 0xB;
1303 goto doFlat;
1304 /* Control letter */
1305 case 'c':
1306 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1307 c = (jschar) (*state->cp++ & 0x1F);
1308 } else {
1309 /* back off to accepting the original '\' as a literal */
1310 --state->cp;
1311 c = '\\';
1312 }
1313 goto doFlat;
1314 /* HexEscapeSequence */
1315 case 'x':
1316 nDigits = 2;
1317 goto lexHex;
1318 /* UnicodeEscapeSequence */
1319 case 'u':
1320 nDigits = 4;
1321 lexHex:
1322 n = 0;
1323 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1324 uintN digit;
1325 c = *state->cp++;
1326 if (!isASCIIHexDigit(c, &digit)) {
1327 /*
1328 * Back off to accepting the original 'u' or 'x' as a
1329 * literal.
1330 */
1331 state->cp -= i + 2;
1332 n = *state->cp++;
1333 break;
1334 }
1335 n = (n << 4) | digit;
1336 }
1337 c = (jschar) n;
1338 goto doFlat;
1339 /* Character class escapes */
1340 case 'd':
1341 state->result = NewRENode(state, REOP_DIGIT);
1342 doSimple:
1343 if (!state->result)
1344 return JS_FALSE;
1345 state->progLength++;
1346 break;
1347 case 'D':
1348 state->result = NewRENode(state, REOP_NONDIGIT);
1349 goto doSimple;
1350 case 's':
1351 state->result = NewRENode(state, REOP_SPACE);
1352 goto doSimple;
1353 case 'S':
1354 state->result = NewRENode(state, REOP_NONSPACE);
1355 goto doSimple;
1356 case 'w':
1357 state->result = NewRENode(state, REOP_ALNUM);
1358 goto doSimple;
1359 case 'W':
1360 state->result = NewRENode(state, REOP_NONALNUM);
1361 goto doSimple;
1362 /* IdentityEscape */
1363 default:
1364 state->result = NewRENode(state, REOP_FLAT);
1365 if (!state->result)
1366 return JS_FALSE;
1367 state->result->u.flat.chr = c;
1368 state->result->u.flat.length = 1;
1369 state->result->kid = (void *) (state->cp - 1);
1370 state->progLength += 3;
1371 break;
1372 }
1373 break;
1374 case '[':
1375 state->result = NewRENode(state, REOP_CLASS);
1376 if (!state->result)
1377 return JS_FALSE;
1378 termStart = state->cp;
1379 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1380 for (;;) {
1381 if (state->cp == state->cpend) {
1382 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1383 JSMSG_UNTERM_CLASS, termStart);
1384
1385 return JS_FALSE;
1386 }
1387 if (*state->cp == '\\') {
1388 state->cp++;
1389 if (state->cp != state->cpend)
1390 state->cp++;
1391 continue;
1392 }
1393 if (*state->cp == ']') {
1394 state->result->u.ucclass.kidlen = state->cp - termStart;
1395 break;
1396 }
1397 state->cp++;
1398 }
1399 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1400 if (!state->classCache[i].start) {
1401 state->classCache[i].start = termStart;
1402 state->classCache[i].length = state->result->u.ucclass.kidlen;
1403 state->classCache[i].index = state->classCount;
1404 break;
1405 }
1406 if (state->classCache[i].length ==
1407 state->result->u.ucclass.kidlen) {
1408 for (n = 0; ; n++) {
1409 if (n == state->classCache[i].length) {
1410 state->result->u.ucclass.index
1411 = state->classCache[i].index;
1412 goto claim;
1413 }
1414 if (state->classCache[i].start[n] != termStart[n])
1415 break;
1416 }
1417 }
1418 }
1419 state->result->u.ucclass.index = state->classCount++;
1420
1421 claim:
1422 /*
1423 * Call CalculateBitmapSize now as we want any errors it finds
1424 * to be reported during the parse phase, not at execution.
1425 */
1426 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1427 return JS_FALSE;
1428 /*
1429 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1430 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1431 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1432 */
1433 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1434 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1435 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1436 return JS_FALSE;
1437 }
1438 state->classBitmapsMem += n;
1439 /* CLASS, <index> */
1440 state->progLength
1441 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1442 break;
1443
1444 case '.':
1445 state->result = NewRENode(state, REOP_DOT);
1446 goto doSimple;
1447
1448 case '{':
1449 {
1450 const jschar *errp = state->cp--;
1451 intN err;
1452
1453 err = ParseMinMaxQuantifier(state, JS_TRUE);
1454 state->cp = errp;
1455
1456 if (err < 0)
1457 goto asFlat;
1458
1459 /* FALL THROUGH */
1460 }
1461 case '*':
1462 case '+':
1463 case '?':
1464 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1465 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1466 return JS_FALSE;
1467 default:
1468 asFlat:
1469 state->result = NewRENode(state, REOP_FLAT);
1470 if (!state->result)
1471 return JS_FALSE;
1472 state->result->u.flat.chr = c;
1473 state->result->u.flat.length = 1;
1474 state->result->kid = (void *) (state->cp - 1);
1475 state->progLength += 3;
1476 break;
1477 }
1478 return ParseQuantifier(state);
1479 }
1480
1481 static JSBool
1482 ParseQuantifier(CompilerState *state)
1483 {
1484 RENode *term;
1485 term = state->result;
1486 if (state->cp < state->cpend) {
1487 switch (*state->cp) {
1488 case '+':
1489 state->result = NewRENode(state, REOP_QUANT);
1490 if (!state->result)
1491 return JS_FALSE;
1492 state->result->u.range.min = 1;
1493 state->result->u.range.max = (uintN)-1;
1494 /* <PLUS>, <next> ... <ENDCHILD> */
1495 state->progLength += 4;
1496 goto quantifier;
1497 case '*':
1498 state->result = NewRENode(state, REOP_QUANT);
1499 if (!state->result)
1500 return JS_FALSE;
1501 state->result->u.range.min = 0;
1502 state->result->u.range.max = (uintN)-1;
1503 /* <STAR>, <next> ... <ENDCHILD> */
1504 state->progLength += 4;
1505 goto quantifier;
1506 case '?':
1507 state->result = NewRENode(state, REOP_QUANT);
1508 if (!state->result)
1509 return JS_FALSE;
1510 state->result->u.range.min = 0;
1511 state->result->u.range.max = 1;
1512 /* <OPT>, <next> ... <ENDCHILD> */
1513 state->progLength += 4;
1514 goto quantifier;
1515 case '{': /* balance '}' */
1516 {
1517 intN err;
1518 const jschar *errp = state->cp;
1519
1520 err = ParseMinMaxQuantifier(state, JS_FALSE);
1521 if (err == 0)
1522 goto quantifier;
1523 if (err == -1)
1524 return JS_TRUE;
1525
1526 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1527 return JS_FALSE;
1528 }
1529 default:;
1530 }
1531 }
1532 return JS_TRUE;
1533
1534 quantifier:
1535 if (state->treeDepth == TREE_DEPTH_MAX) {
1536 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1537 return JS_FALSE;
1538 }
1539
1540 ++state->treeDepth;
1541 ++state->cp;
1542 state->result->kid = term;
1543 if (state->cp < state->cpend && *state->cp == '?') {
1544 ++state->cp;
1545 state->result->u.range.greedy = JS_FALSE;
1546 } else {
1547 state->result->u.range.greedy = JS_TRUE;
1548 }
1549 return JS_TRUE;
1550 }
1551
1552 static intN
1553 ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues)
1554 {
1555 uintN min, max;
1556 jschar c;
1557 const jschar *errp = state->cp++;
1558
1559 c = *state->cp;
1560 if (JS7_ISDEC(c)) {
1561 ++state->cp;
1562 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1563 c = *state->cp;
1564
1565 if (!ignoreValues && min == OVERFLOW_VALUE)
1566 return JSMSG_MIN_TOO_BIG;
1567
1568 if (c == ',') {
1569 c = *++state->cp;
1570 if (JS7_ISDEC(c)) {
1571 ++state->cp;
1572 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1573 c = *state->cp;
1574 if (!ignoreValues && max == OVERFLOW_VALUE)
1575 return JSMSG_MAX_TOO_BIG;
1576 if (!ignoreValues && min > max)
1577 return JSMSG_OUT_OF_ORDER;
1578 } else {
1579 max = (uintN)-1;
1580 }
1581 } else {
1582 max = min;
1583 }
1584 if (c == '}') {
1585 state->result = NewRENode(state, REOP_QUANT);
1586 if (!state->result)
1587 return JSMSG_OUT_OF_MEMORY;
1588 state->result->u.range.min = min;
1589 state->result->u.range.max = max;
1590 /*
1591 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1592 * where <max> is written as compact(max+1) to make
1593 * (uintN)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1594 */
1595 state->progLength += (1 + GetCompactIndexWidth(min)
1596 + GetCompactIndexWidth(max + 1)
1597 +3);
1598 return 0;
1599 }
1600 }
1601
1602 state->cp = errp;
1603 return -1;
1604 }
1605
1606 static JSBool
1607 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
1608 {
1609 ptrdiff_t offset = target - jump;
1610
1611 /* Check that target really points forward. */
1612 JS_ASSERT(offset >= 2);
1613 if ((size_t)offset > OFFSET_MAX)
1614 return JS_FALSE;
1615
1616 jump[0] = JUMP_OFFSET_HI(offset);
1617 jump[1] = JUMP_OFFSET_LO(offset);
1618 return JS_TRUE;
1619 }
1620
1621 /* Copy the charset data from a character class node to the charset list
1622 * in the regexp object. */
1623 static JS_ALWAYS_INLINE RECharSet *
1624 InitNodeCharSet(JSRegExp *re, RENode *node)
1625 {
1626 RECharSet *charSet = &re->classList[node->u.ucclass.index];
1627 charSet->converted = JS_FALSE;
1628 charSet->length = node->u.ucclass.bmsize;
1629 charSet->u.src.startIndex = node->u.ucclass.startIndex;
1630 charSet->u.src.length = node->u.ucclass.kidlen;
1631 charSet->sense = node->u.ucclass.sense;
1632 return charSet;
1633 }
1634
1635 /*
1636 * Generate bytecode for the tree rooted at t using an explicit stack instead
1637 * of recursion.
1638 */
1639 static jsbytecode *
1640 EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
1641 jsbytecode *pc, RENode *t)
1642 {
1643 EmitStateStackEntry *emitStateSP, *emitStateStack;
1644 REOp op;
1645
1646 if (treeDepth == 0) {
1647 emitStateStack = NULL;
1648 } else {
1649 emitStateStack =
1650 (EmitStateStackEntry *)
1651 state->context->malloc(sizeof(EmitStateStackEntry) * treeDepth);
1652 if (!emitStateStack)
1653 return NULL;
1654 }
1655 emitStateSP = emitStateStack;
1656 op = t->op;
1657 JS_ASSERT(op < REOP_LIMIT);
1658
1659 for (;;) {
1660 *pc++ = op;
1661 switch (op) {
1662 case REOP_EMPTY:
1663 --pc;
1664 break;
1665
1666 case REOP_ALTPREREQ2:
1667 case REOP_ALTPREREQ:
1668 JS_ASSERT(emitStateSP);
1669 emitStateSP->altHead = pc - 1;
1670 emitStateSP->endTermFixup = pc;
1671 pc += OFFSET_LEN;
1672 SET_ARG(pc, t->u.altprereq.ch1);
1673 pc += ARG_LEN;
1674 SET_ARG(pc, t->u.altprereq.ch2);
1675 pc += ARG_LEN;
1676
1677 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
1678 pc += OFFSET_LEN;
1679
1680 emitStateSP->continueNode = t;
1681 emitStateSP->continueOp = REOP_JUMP;
1682 emitStateSP->jumpToJumpFlag = JS_FALSE;
1683 ++emitStateSP;
1684 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1685 t = (RENode *) t->kid;
1686 op = t->op;
1687 JS_ASSERT(op < REOP_LIMIT);
1688 continue;
1689
1690 case REOP_JUMP:
1691 emitStateSP->nextTermFixup = pc; /* offset to following term */
1692 pc += OFFSET_LEN;
1693 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
1694 goto jump_too_big;
1695 emitStateSP->continueOp = REOP_ENDALT;
1696 ++emitStateSP;
1697 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1698 t = (RENode *) t->u.kid2;
1699 op = t->op;
1700 JS_ASSERT(op < REOP_LIMIT);
1701 continue;
1702
1703 case REOP_ENDALT:
1704 /*
1705 * If we already patched emitStateSP->nextTermFixup to jump to
1706 * a nearer jump, to avoid 16-bit immediate offset overflow, we
1707 * are done here.
1708 */
1709 if (emitStateSP->jumpToJumpFlag)
1710 break;
1711
1712 /*
1713 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
1714 * REOP_ENDALT is executed only on successful match of the last
1715 * alternate in a group.
1716 */
1717 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1718 goto jump_too_big;
1719 if (t->op != REOP_ALT) {
1720 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
1721 goto jump_too_big;
1722 }
1723
1724 /*
1725 * If the program is bigger than the REOP_JUMP offset range, then
1726 * we must check for alternates before this one that are part of
1727 * the same group, and fix up their jump offsets to target jumps
1728 * close enough to fit in a 16-bit unsigned offset immediate.
1729 */
1730 if ((size_t)(pc - re->program) > OFFSET_MAX &&
1731 emitStateSP > emitStateStack) {
1732 EmitStateStackEntry *esp, *esp2;
1733 jsbytecode *alt, *jump;
1734 ptrdiff_t span, header;
1735
1736 esp2 = emitStateSP;
1737 alt = esp2->altHead;
1738 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
1739 if (esp->continueOp == REOP_ENDALT &&
1740 !esp->jumpToJumpFlag &&
1741 esp->nextTermFixup + OFFSET_LEN == alt &&
1742 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
1743 ? esp->endTermFixup
1744 : esp->nextTermFixup)) > OFFSET_MAX) {
1745 alt = esp->altHead;
1746 jump = esp->nextTermFixup;
1747
1748 /*
1749 * The span must be 1 less than the distance from
1750 * jump offset to jump offset, so we actually jump
1751 * to a REOP_JUMP bytecode, not to its offset!
1752 */
1753 for (;;) {
1754 JS_ASSERT(jump < esp2->nextTermFixup);
1755 span = esp2->nextTermFixup - jump - 1;
1756 if ((size_t)span <= OFFSET_MAX)
1757 break;
1758 do {
1759 if (--esp2 == esp)
1760 goto jump_too_big;
1761 } while (esp2->continueOp != REOP_ENDALT);
1762 }
1763
1764 jump[0] = JUMP_OFFSET_HI(span);
1765 jump[1] = JUMP_OFFSET_LO(span);
1766
1767 if (esp->continueNode->op != REOP_ALT) {
1768 /*
1769 * We must patch the offset at esp->endTermFixup
1770 * as well, for the REOP_ALTPREREQ{,2} opcodes.
1771 * If we're unlucky and endTermFixup is more than
1772 * OFFSET_MAX bytes from its target, we cheat by
1773 * jumping 6 bytes to the jump whose offset is at
1774 * esp->nextTermFixup, which has the same target.
1775 */
1776 jump = esp->endTermFixup;
1777 header = esp->nextTermFixup - jump;
1778 span += header;
1779 if ((size_t)span > OFFSET_MAX)
1780 span = header;
1781
1782 jump[0] = JUMP_OFFSET_HI(span);
1783 jump[1] = JUMP_OFFSET_LO(span);
1784 }
1785
1786 esp->jumpToJumpFlag = JS_TRUE;
1787 }
1788 }
1789 }
1790 break;
1791
1792 case REOP_ALT:
1793 JS_ASSERT(emitStateSP);
1794 emitStateSP->altHead = pc - 1;
1795 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
1796 pc += OFFSET_LEN;
1797 emitStateSP->continueNode = t;
1798 emitStateSP->continueOp = REOP_JUMP;
1799 emitStateSP->jumpToJumpFlag = JS_FALSE;
1800 ++emitStateSP;
1801 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1802 t = (RENode *) t->kid;
1803 op = t->op;
1804 JS_ASSERT(op < REOP_LIMIT);
1805 continue;
1806
1807 case REOP_FLAT:
1808 /*
1809 * Coalesce FLATs if possible and if it would not increase bytecode
1810 * beyond preallocated limit. The latter happens only when bytecode
1811 * size for coalesced string with offset p and length 2 exceeds 6
1812 * bytes preallocated for 2 single char nodes, i.e. when
1813 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
1814 * GetCompactIndexWidth(p) > 4.
1815 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
1816 * nodes strictly decreases bytecode size, the check has to be
1817 * done only for the first coalescing.
1818 */
1819 if (t->kid &&
1820 GetCompactIndexWidth((jschar *)t->kid - state->cpbegin) <= 4)
1821 {
1822 while (t->next &&
1823 t->next->op == REOP_FLAT &&
1824 (jschar*)t->kid + t->u.flat.length ==
1825 (jschar*)t->next->kid) {
1826 t->u.flat.length += t->next->u.flat.length;
1827 t->next = t->next->next;
1828 }
1829 }
1830 if (t->kid && t->u.flat.length > 1) {
1831 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
1832 pc = WriteCompactIndex(pc, (jschar *)t->kid - state->cpbegin);
1833 pc = WriteCompactIndex(pc, t->u.flat.length);
1834 } else if (t->u.flat.chr < 256) {
1835 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
1836 *pc++ = (jsbytecode) t->u.flat.chr;
1837 } else {
1838 pc[-1] = (state->flags & JSREG_FOLD)
1839 ? REOP_UCFLAT1i
1840 : REOP_UCFLAT1;
1841 SET_ARG(pc, t->u.flat.chr);
1842 pc += ARG_LEN;
1843 }
1844 break;
1845
1846 case REOP_LPAREN:
1847 JS_ASSERT(emitStateSP);
1848 pc = WriteCompactIndex(pc, t->u.parenIndex);
1849 emitStateSP->continueNode = t;
1850 emitStateSP->continueOp = REOP_RPAREN;
1851 ++emitStateSP;
1852 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1853 t = (RENode *) t->kid;
1854 op = t->op;
1855 continue;
1856
1857 case REOP_RPAREN:
1858 pc = WriteCompactIndex(pc, t->u.parenIndex);
1859 break;
1860
1861 case REOP_BACKREF:
1862 pc = WriteCompactIndex(pc, t->u.parenIndex);
1863 break;
1864
1865 case REOP_ASSERT:
1866 JS_ASSERT(emitStateSP);
1867 emitStateSP->nextTermFixup = pc;
1868 pc += OFFSET_LEN;
1869 emitStateSP->continueNode = t;
1870 emitStateSP->continueOp = REOP_ASSERTTEST;
1871 ++emitStateSP;
1872 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1873 t = (RENode *) t->kid;
1874 op = t->op;
1875 continue;
1876
1877 case REOP_ASSERTTEST:
1878 case REOP_ASSERTNOTTEST:
1879 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1880 goto jump_too_big;
1881 break;
1882
1883 case REOP_ASSERT_NOT:
1884 JS_ASSERT(emitStateSP);
1885 emitStateSP->nextTermFixup = pc;
1886 pc += OFFSET_LEN;
1887 emitStateSP->continueNode = t;
1888 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
1889 ++emitStateSP;
1890 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1891 t = (RENode *) t->kid;
1892 op = t->op;
1893 continue;
1894
1895 case REOP_QUANT:
1896 JS_ASSERT(emitStateSP);
1897 if (t->u.range.min == 0 && t->u.range.max == (uintN)-1) {
1898 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
1899 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
1900 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
1901 } else if (t->u.range.min == 1 && t->u.range.max == (uintN) -1) {
1902 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
1903 } else {
1904 if (!t->u.range.greedy)
1905 pc[-1] = REOP_MINIMALQUANT;
1906 pc = WriteCompactIndex(pc, t->u.range.min);
1907 /*
1908 * Write max + 1 to avoid using size_t(max) + 1 bytes
1909 * for (uintN)-1 sentinel.
1910 */
1911 pc = WriteCompactIndex(pc, t->u.range.max + 1);
1912 }
1913 emitStateSP->nextTermFixup = pc;
1914 pc += OFFSET_LEN;
1915 emitStateSP->continueNode = t;
1916 emitStateSP->continueOp = REOP_ENDCHILD;
1917 ++emitStateSP;
1918 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1919 t = (RENode *) t->kid;
1920 op = t->op;
1921 continue;
1922
1923 case REOP_ENDCHILD:
1924 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1925 goto jump_too_big;
1926 break;
1927
1928 case REOP_CLASS:
1929 if (!t->u.ucclass.sense)
1930 pc[-1] = REOP_NCLASS;
1931 pc = WriteCompactIndex(pc, t->u.ucclass.index);
1932 InitNodeCharSet(re, t);
1933 break;
1934
1935 default:
1936 break;
1937 }
1938
1939 t = t->next;
1940 if (t) {
1941 op = t->op;
1942 } else {
1943 if (emitStateSP == emitStateStack)
1944 break;
1945 --emitStateSP;
1946 t = emitStateSP->continueNode;
1947 op = (REOp) emitStateSP->continueOp;
1948 }
1949 }
1950
1951 cleanup:
1952 if (emitStateStack)
1953 state->context->free(emitStateStack);
1954 return pc;
1955
1956 jump_too_big:
1957 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1958 pc = NULL;
1959 goto cleanup;
1960 }
1961
1962 static JSBool
1963 CompileRegExpToAST(JSContext* cx, JSTokenStream* ts,
1964 JSString* str, uintN flags, CompilerState& state)
1965 {
1966 uintN i;
1967 size_t len;
1968
1969 len = str->length();
1970
1971 state.context = cx;
1972 state.tokenStream = ts;
1973 state.cp = js_UndependString(cx, str);
1974 if (!state.cp)
1975 return JS_FALSE;
1976 state.cpbegin = state.cp;
1977 state.cpend = state.cp + len;
1978 state.flags = flags;
1979 state.parenCount = 0;
1980 state.classCount = 0;
1981 state.progLength = 0;
1982 state.treeDepth = 0;
1983 state.classBitmapsMem = 0;
1984 for (i = 0; i < CLASS_CACHE_SIZE; i++)
1985 state.classCache[i].start = NULL;
1986
1987 if (len != 0 && (flags & JSREG_FLAT)) {
1988 state.result = NewRENode(&state, REOP_FLAT);
1989 if (!state.result)
1990 return JS_FALSE;
1991 state.result->u.flat.chr = *state.cpbegin;
1992 state.result->u.flat.length = len;
1993 state.result->kid = (void *) state.cpbegin;
1994 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
1995 state.progLength += 1 + GetCompactIndexWidth(0)
1996 + GetCompactIndexWidth(len);
1997 return JS_TRUE;
1998 }
1999
2000 return ParseRegExp(&state);
2001 }
2002
2003 #ifdef JS_TRACER
2004 typedef js::Vector<LIns *, 4, js::ContextAllocPolicy> LInsList;
2005
2006 /* Return the cached fragment for the given regexp, or create one. */
2007 static Fragment*
2008 LookupNativeRegExp(JSContext* cx, uint16 re_flags,
2009 const jschar* re_chars, size_t re_length)
2010 {
2011 JSTraceMonitor *tm = &JS_TRACE_MONITOR(cx);
2012 VMAllocator &alloc = *tm->dataAlloc;
2013 REHashMap &table = *tm->reFragments;
2014
2015 REHashKey k(re_length, re_flags, re_chars);
2016 Fragment *frag = table.get(k);
2017
2018 if (!frag) {
2019 verbose_only(
2020 uint32_t profFragID = (js_LogController.lcbits & LC_FragProfile)
2021 ? (++(tm->lastFragID)) : 0;
2022 )
2023 frag = new (alloc) Fragment(0 verbose_only(, profFragID));
2024 frag->lirbuf = tm->reLirBuf;
2025 frag->root = frag;
2026 /*
2027 * Copy the re_chars portion of the hash key into the Allocator, so
2028 * its lifecycle is disconnected from the lifecycle of the
2029 * underlying regexp.
2030 */
2031 k.re_chars = (const jschar*) new (alloc) jschar[re_length];
2032 memcpy((void*) k.re_chars, re_chars, re_length * sizeof(jschar));
2033 table.put(k, frag);
2034 }
2035 return frag;
2036 }
2037
2038 static JSBool
2039 ProcessCharSet(JSContext *cx, JSRegExp *re, RECharSet *charSet);
2040
2041 /* Utilities for the RegExpNativeCompiler */
2042
2043 namespace {
2044 /*
2045 * An efficient way to simultaneously statically guard that the sizeof(bool) is a
2046 * small power of 2 and take its log2.
2047 */
2048 template <int> struct StaticLog2 {};
2049 template <> struct StaticLog2<1> { static const int result = 0; };
2050 template <> struct StaticLog2<2> { static const int result = 1; };
2051 template <> struct StaticLog2<4> { static const int result = 2; };
2052 template <> struct StaticLog2<8> { static const int result = 3; };
2053 }
2054
2055 /*
2056 * This table allows efficient testing for the ASCII portion of \s during a
2057 * trace. ECMA-262 15.10.2.12 defines the following characters below 128 to be
2058 * whitespace: 0x9 (0), 0xA (10), 0xB (11), 0xC (12), 0xD (13), 0x20 (32). The
2059 * index must be <= 32.
2060 */
2061 static const bool js_ws[] = {
2062 /* 0 1 2 3 4 5 5 7 8 9 */
2063 /* 0 */ false, false, false, false, false, false, false, false, false, true,
2064 /* 1 */ true, true, true, true, false, false, false, false, false, false,
2065 /* 2 */ false, false, false, false, false, false, false, false, false, false,
2066 /* 3 */ false, false, true
2067 };
2068
2069 /* Sets of characters are described in terms of individuals and classes. */
2070 class CharSet {
2071 public:
2072 CharSet() : charEnd(charBuf), classes(0) {}
2073
2074 bool full() { return charEnd == charBuf + BufSize; }
2075
2076 /* Add a single char to the set. */
2077 bool addChar(jschar c)
2078 {
2079 if (full())
2080 return false;
2081 *charEnd++ = c;
2082 return true;
2083 }
2084
2085 enum Class {
2086 LineTerms = 1 << 0, /* Line Terminators (E262 7.3) */
2087 OtherSpace = 1 << 1, /* \s (E262 15.10.2.12) - LineTerms */
2088 Digit = 1 << 2, /* \d (E262 15.10.2.12) */
2089 OtherAlnum = 1 << 3, /* \w (E262 15,10.2.12) - Digit */
2090 Other = 1 << 4, /* all other characters */
2091 All = LineTerms | OtherSpace | Digit | OtherAlnum | Other,
2092
2093 Space = LineTerms | OtherSpace,
2094 AlNum = Digit | OtherAlnum,
2095 Dot = All & ~LineTerms
2096 };
2097
2098 /* Add a set of chars to the set. */
2099 void addClass(Class c) { classes |= c; }
2100
2101 /* Return whether two sets of chars are disjoint. */
2102 bool disjoint(const CharSet &) const;
2103
2104 private:
2105 static bool disjoint(const jschar *beg, const jschar *end, uintN classes);
2106
2107 static const uintN BufSize = 8;
2108 mutable jschar charBuf[BufSize];
2109 jschar *charEnd;
2110 uintN classes;
2111 };
2112
2113 /* Appease the type checker. */
2114 static inline CharSet::Class
2115 operator|(CharSet::Class c1, CharSet::Class c2) {
2116 return (CharSet::Class)(((int)c1) | ((int)c2));
2117 }
2118 static inline CharSet::Class
2119 operator~(CharSet::Class c) {
2120 return (CharSet::Class)(~(int)c);
2121 }
2122
2123 /*
2124 * Return whether the characters in the range [beg, end) fall within any of the
2125 * classes with a bit set in 'classes'.
2126 */
2127 bool
2128 CharSet::disjoint(const jschar *beg, const jschar *end, uintN classes)
2129 {
2130 for (const jschar *p = beg; p != end; ++p) {
2131 if (JS7_ISDEC(*p)) {
2132 if (classes & Digit)
2133 return false;
2134 } else if (JS_ISWORD(*p)) {
2135 if (classes & OtherAlnum)
2136 return false;
2137 } else if (RE_IS_LINE_TERM(*p)) {
2138 if (classes & LineTerms)
2139 return false;
2140 } else if (JS_ISSPACE(*p)) {
2141 if (classes & OtherSpace)
2142 return false;
2143 } else {
2144 if (classes & Other)
2145 return false;
2146 }
2147 }
2148 return true;
2149 }
2150
2151 /*
2152 * Predicate version of the STL's set_intersection. Assumes both ranges are
2153 * sorted and thus runs in linear time.
2154 *
2155 * FIXME: This is a reusable algorithm, perhaps it should be put somewhere.
2156 */
2157 template <class InputIterator1, class InputIterator2>
2158 bool
2159 set_disjoint(InputIterator1 p1, InputIterator1 end1,
2160 InputIterator2 p2, InputIterator2 end2)
2161 {
2162 if (p1 == end1 || p2 == end2)
2163 return true;
2164 while (*p1 != *p2) {
2165 if (*p1 < *p2) {
2166 ++p1;
2167 if (p1 == end1)
2168 return true;
2169 } else if (*p2 < *p1) {
2170 ++p2;
2171 if (p2 == end2)
2172 return true;
2173 }
2174 }
2175 return false;
2176 }
2177
2178 bool
2179 CharSet::disjoint(const CharSet &other) const
2180 {
2181 /* Check overlap between classes. */
2182 if (classes & other.classes)
2183 return false;
2184
2185 /*
2186 * Check char-class overlap. Compare this->charBuf with other.classes and
2187 * vice versa with a loop.
2188 */
2189 if (!disjoint(this->charBuf, this->charEnd, other.classes) ||
2190 !disjoint(other.charBuf, other.charEnd, this->classes))
2191 return false;
2192
2193 /* Check char-char overlap. */
2194 std::sort(charBuf, charEnd);
2195 std::sort(other.charBuf, other.charEnd);
2196 return set_disjoint(charBuf, charEnd, other.charBuf, other.charEnd);
2197 }
2198
2199 /*
2200 * Return true if the given subexpression may match the empty string. The
2201 * conservative answer is |true|. If |next| is true, then the subexpression is
2202 * considered to be |node| followed by the rest of |node->next|. Otherwise, the
2203 * subexpression is considered to be |node| by itself.
2204 */
2205 static bool
2206 mayMatchEmpty(RENode *node, bool next = true)
2207 {
2208 if (!node)
2209 return true;
2210 switch (node->op) {
2211 case REOP_EMPTY: return true;
2212 case REOP_FLAT: return false;
2213 case REOP_CLASS: return false;
2214 case REOP_ALNUM: return false;
2215 case REOP_ALT: return (mayMatchEmpty((RENode *)node->kid) ||
2216 mayMatchEmpty((RENode *)node->u.kid2)) &&
2217 (!next || mayMatchEmpty(node->next));
2218 case REOP_QUANT: return (node->u.range.min == 0 ||
2219 mayMatchEmpty((RENode *)node->kid)) &&
2220 (!next || mayMatchEmpty(node->next));
2221 default: return true;
2222 }
2223 }
2224
2225 /*
2226 * Enumerate the set of characters that may be consumed next by the given
2227 * subexpression in isolation. Return whether the enumeration was successful.
2228 */
2229 static bool
2230 enumerateNextChars(JSContext *cx, RENode *node, CharSet &set)
2231 {
2232 JS_CHECK_RECURSION(cx, return JS_FALSE);
2233
2234 if (!node)
2235 return true;
2236
2237 switch (node->op) {
2238 /* Record as bitflags. */
2239 case REOP_DOT: set.addClass(CharSet::Dot); return true;
2240 case REOP_DIGIT: set.addClass(CharSet::Digit); return true;
2241 case REOP_NONDIGIT: set.addClass(~CharSet::Digit); return true;
2242 case REOP_ALNUM: set.addClass(CharSet::AlNum); return true;
2243 case REOP_NONALNUM: set.addClass(~CharSet::AlNum); return true;
2244 case REOP_SPACE: set.addClass(CharSet::Space); return true;
2245 case REOP_NONSPACE: set.addClass(~CharSet::Space); return true;
2246
2247 /* Record as individual characters. */
2248 case REOP_FLAT:
2249 return set.addChar(node->u.flat.chr);
2250
2251 /* Control structures. */
2252 case REOP_EMPTY:
2253 return true;
2254 case REOP_ALT:
2255 case REOP_ALTPREREQ:
2256 return enumerateNextChars(cx, (RENode *)node->kid, set) &&
2257 enumerateNextChars(cx, (RENode *)node->u.kid2, set) &&
2258 (!mayMatchEmpty(node, false) ||
2259 enumerateNextChars(cx, (RENode *)node->next, set));
2260 case REOP_QUANT:
2261 return enumerateNextChars(cx, (RENode *)node->kid, set) &&
2262 (!mayMatchEmpty(node, false) ||
2263 enumerateNextChars(cx, (RENode *)node->next, set));
2264
2265 /* Arbitrary character classes and oddities. */
2266 default:
2267 return false;
2268 }
2269 }
2270
2271 class RegExpNativeCompiler {
2272 private:
2273 VMAllocator& tempAlloc;
2274 JSContext* cx;
2275 JSRegExp* re;
2276 CompilerState* cs; /* RegExp to compile */
2277 Fragment* fragment;
2278 LirWriter* lir;
2279 #ifdef DEBUG
2280 LirWriter* sanity_filter;
2281 #endif
2282 #ifdef NJ_VERBOSE
2283 LirWriter* verbose_filter;
2284 #endif
2285 LirBufWriter* lirBufWriter; /* for skip */
2286
2287 LIns* state;
2288 LIns* start;
2289 LIns* cpend;
2290
2291 bool outOfMemory() {
2292 return tempAlloc.outOfMemory() || JS_TRACE_MONITOR(cx).dataAlloc->outOfMemory();
2293 }
2294
2295 JSBool isCaseInsensitive() const { return (cs->flags & JSREG_FOLD) != 0; }
2296
2297 void targetCurrentPoint(LIns *ins)
2298 {
2299 ins->setTarget(lir->ins0(LIR_label));
2300 }
2301
2302 void targetCurrentPoint(LInsList &fails)
2303 {
2304 LIns *fail = lir->ins0(LIR_label);
2305 for (size_t i = 0; i < fails.length(); ++i) {
2306 fails[i]->setTarget(fail);
2307 }
2308 fails.clear();
2309 }
2310
2311 /*
2312 * These functions return the new position after their match operation,
2313 * or NULL if there was an error.
2314 */
2315 LIns* compileEmpty(RENode* node, LIns* pos, LInsList& fails)
2316 {
2317 return pos;
2318 }
2319
2320 #if defined(AVMPLUS_ARM) || defined(AVMPLUS_SPARC)
2321 /* We can't do this on ARM or SPARC, since it relies on doing a 32-bit load from
2322 * a pointer which is only 2-byte aligned.
2323 */
2324 #undef USE_DOUBLE_CHAR_MATCH
2325 #else
2326 #define USE_DOUBLE_CHAR_MATCH
2327 #endif
2328
2329 LIns* compileFlatSingleChar(jschar ch, LIns* pos, LInsList& fails)
2330 {
2331 LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_plt, pos, cpend), 0);
2332 if (!fails.append(to_fail))
2333 return NULL;
2334 LIns* text_ch = lir->insLoad(LIR_ldcs, pos, 0);
2335
2336 // Extra characters that need to be compared against when doing folding.
2337 struct extra {
2338 jschar ch;
2339 LIns *match;
2340 };
2341 extra extras[5];
2342 int nextras = 0;
2343
2344 if (cs->flags & JSREG_FOLD) {
2345 ch = JS_TOUPPER(ch);
2346 jschar lch = JS_TOLOWER(ch);
2347
2348 if (ch != lch) {
2349 if (L'A' <= ch && ch <= L'Z') {
2350 // Fast conversion of text character to lower case by OR-ing with 32.
2351 text_ch = lir->ins2(LIR_or, text_ch, lir->insImm(32));
2352 // These ASCII letters have 2 lower-case forms. We put the ASCII one in
2353 // |extras| so it is tested first, because we expect that to be the common
2354 // case. Note that the code points of the non-ASCII forms both have the
2355 // 32 bit set, so it is OK to compare against the OR-32-converted text char.
2356 ch = lch;
2357 if (ch == L'i') {
2358 extras[nextras++].ch = ch;
2359 ch = 0x131;
2360 } else if (ch == L's') {
2361 extras[nextras++].ch = ch;
2362 ch = 0x17f;
2363 }
2364 goto gen;
2365 } else if (0x01c4 <= ch && ch <= 0x1e60) {
2366 // The following group of conditionals handles characters that have 1 or 2
2367 // lower-case forms in addition to JS_TOLOWER(ch).
2368 if (ch <= 0x1f1) { // DZ,LJ,NJ
2369 if (ch == 0x01c4) {
2370 extras[nextras++].ch = 0x01c5;
2371 } else if (ch == 0x01c7) {
2372 extras[nextras++].ch = 0x01c8;
2373 } else if (ch == 0x01ca) {
2374 extras[nextras++].ch = 0x01cb;
2375 } else if (ch == 0x01f1) {
2376 extras[nextras++].ch = 0x01f2;
2377 }
2378 } else if (ch < 0x0392) { // no extra lower-case forms in this range
2379 } else if (ch <= 0x03a6) { // Greek
2380 if (ch == 0x0392) {
2381 extras[nextras++].ch = 0x03d0;
2382 } else if (ch == 0x0395) {
2383 extras[nextras++].ch = 0x03f5;
2384 } else if (ch == 0x0398) {
2385 extras[nextras++].ch = 0x03d1;
2386 } else if (ch == 0x0399) {
2387 extras[nextras++].ch = 0x0345;
2388 extras[nextras++].ch = 0x1fbe;
2389 } else if (ch == 0x039a) {
2390 extras[nextras++].ch = 0x03f0;
2391 } else if (ch == 0x039c) {
2392 extras[nextras++].ch = 0xb5;
2393 } else if (ch == 0x03a0) {
2394 extras[nextras++].ch = 0x03d6;
2395 } else if (ch == 0x03a1) {
2396 extras[nextras++].ch = 0x03f1;
2397 } else if (ch == 0x03a3) {
2398 extras[nextras++].ch = 0x03c2;
2399 } else if (ch == 0x03a6) {
2400 extras[nextras++].ch = 0x03d5;
2401 }
2402 } else if (ch == 0x1e60) { // S with dot above
2403 extras[nextras++].ch = 0x1e9b;
2404 }
2405 }
2406
2407 extras[nextras++].ch = lch;
2408 }
2409 }
2410
2411 gen:
2412 for (int i = 0; i < nextras; ++i) {
2413 LIns *test = lir->ins2(LIR_eq, text_ch, lir->insImm(extras[i].ch));
2414 LIns *branch = lir->insBranch(LIR_jt, test, 0);
2415 extras[i].match = branch;
2416 }
2417
2418 if (!fails.append(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, text_ch, lir->insImm(ch)), 0)))
2419 return NULL;
2420
2421 for (int i = 0; i < nextras; ++i)
2422 targetCurrentPoint(extras[i].match);
2423 return lir->ins2(LIR_piadd, pos, lir->insImmWord(2));
2424 }
2425
2426 JS_INLINE bool hasCases(jschar ch)
2427 {
2428 return JS_TOLOWER(ch) != JS_TOUPPER(ch);
2429 }
2430
2431 LIns* compileFlatDoubleChar(jschar ch1, jschar ch2, LIns* pos, LInsList& fails)
2432 {
2433 #ifdef IS_BIG_ENDIAN
2434 uint32 word = (ch1 << 16) | ch2;
2435 #else
2436 uint32 word = (ch2 << 16) | ch1;
2437 #endif
2438 /*
2439 * Fast case-insensitive test for ASCII letters: convert text
2440 * char to lower case by bit-or-ing in 32 and compare.
2441 */
2442 JSBool useFastCI = JS_FALSE;
2443 union { jschar c[2]; uint32 i; } mask;
2444 if (cs->flags & JSREG_FOLD) {
2445 jschar uch1 = JS_TOUPPER(ch1);
2446 jschar uch2 = JS_TOUPPER(ch2);
2447 JSBool mask1 = (L'A' <= uch1 && uch1 <= L'Z' && uch1 != L'I' && uch1 != L'S');
2448 JSBool mask2 = (L'A' <= uch2 && uch2 <= L'Z' && uch2 != L'I' && uch2 != L'S');
2449 if ((!mask1 && hasCases(ch1)) || (!mask2 && hasCases(ch2))) {
2450 pos = compileFlatSingleChar(ch1, pos, fails);
2451 if (!pos) return NULL;
2452 return compileFlatSingleChar(ch2, pos, fails);
2453 }
2454
2455 mask.c[0] = mask1 ? 0x0020 : 0x0;
2456 mask.c[1] = mask2 ? 0x0020 : 0x0;
2457
2458 if (mask.i) {
2459 word |= mask.i;
2460 useFastCI = JS_TRUE;
2461 }
2462 }
2463
2464 LIns* to_fail = lir->insBranch(LIR_jf,
2465 lir->ins2(LIR_plt,
2466 pos,
2467 lir->ins2(LIR_piadd,
2468 cpend,
2469 lir->insImmWord(-2))),
2470 0);
2471 if (!fails.append(to_fail))
2472 return NULL;
2473 LIns* text_word = lir->insLoad(LIR_ld, pos, 0);
2474 LIns* comp_word = useFastCI ?
2475 lir->ins2(LIR_or, text_word, lir->insImm(mask.i)) :
2476 text_word;
2477 if (!fails.append(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, comp_word, lir->insImm(word)), 0)))
2478 return NULL;
2479
2480 return lir->ins2(LIR_piadd, pos, lir->insImmWord(4));
2481 }
2482
2483 LIns* compileFlat(RENode *&node, LIns* pos, LInsList& fails)
2484 {
2485 #ifdef USE_DOUBLE_CHAR_MATCH
2486 if (node->u.flat.length == 1) {
2487 if (node->next && node->next->op == REOP_FLAT &&
2488 node->next->u.flat.length == 1) {
2489 pos = compileFlatDoubleChar(node->u.flat.chr,
2490 node->next->u.flat.chr,
2491 pos, fails);
2492 node = node->next;
2493 } else {
2494 pos = compileFlatSingleChar(node->u.flat.chr, pos, fails);
2495 }
2496 return pos;
2497 } else {
2498 size_t i;
2499 for (i = 0; i < node->u.flat.length - 1; i += 2) {
2500 if (outOfMemory())
2501 return 0;
2502 pos = compileFlatDoubleChar(((jschar*) node->kid)[i],
2503 ((jschar*) node->kid)[i+1],
2504 pos, fails);
2505 if (!pos)
2506 return 0;
2507 }
2508 JS_ASSERT(pos != 0);
2509 if (i == node->u.flat.length - 1)
2510 pos = compileFlatSingleChar(((jschar*) node->kid)[i], pos, fails);
2511 return pos;
2512 }
2513 #else
2514 if (node->u.flat.length == 1) {
2515 return compileFlatSingleChar(node->u.flat.chr, pos, fails);
2516 } else {
2517 for (size_t i = 0; i < node->u.flat.length; i++) {
2518 if (outOfMemory())
2519 return 0;
2520 pos = compileFlatSingleChar(((jschar*) node->kid)[i], pos, fails);
2521 if (!pos)
2522 return 0;
2523 }
2524 return pos;
2525 }
2526 #endif
2527 }
2528
2529 LIns* compileClass(RENode* node, LIns* pos, LInsList& fails)
2530 {
2531 if (!node->u.ucclass.sense)
2532 return JS_FALSE;
2533 /*
2534 * If we share generated native code, we need to make a copy
2535 * of the bitmap because the original regexp's copy is destroyed when
2536 * that regexp is.
2537 */
2538 RECharSet *charSet = &re->classList[node->u.ucclass.index];
2539 size_t bitmapLen = (charSet->length >> 3) + 1;
2540 /* Arbitrary size limit on bitmap. */
2541 if (bitmapLen > 1024)
2542 return NULL;
2543 Allocator &alloc = *JS_TRACE_MONITOR(cx).dataAlloc;
2544 /* The following line allocates charSet.u.bits if successful. */
2545 if (!charSet->converted && !ProcessCharSet(cx, re, charSet))
2546 return NULL;
2547 void* bitmapData = alloc.alloc(bitmapLen);
2548 if (outOfMemory())
2549 return NULL;
2550 memcpy(bitmapData, charSet->u.bits, bitmapLen);
2551
2552 LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_plt, pos, cpend), 0);
2553 if (!fails.append(to_fail))
2554 return NULL;
2555 LIns* text_ch = lir->insLoad(LIR_ldcs, pos, 0);
2556 if (!fails.append(lir->insBranch(LIR_jf,
2557 lir->ins2(LIR_le, text_ch, lir->insImm(charSet->length)),
2558 0))) {
2559 return NULL;
2560 }
2561 LIns* byteIndex = lir->ins_i2p(lir->ins2(LIR_rsh, text_ch, lir->insImm(3)));
2562 LIns* bitmap = lir->insImmPtr(bitmapData);
2563 LIns* byte = lir->insLoad(LIR_ldcb, lir->ins2(LIR_piadd, bitmap, byteIndex), (int) 0);
2564 LIns* bitMask = lir->ins2(LIR_lsh, lir->insImm(1),
2565 lir->ins2(LIR_and, text_ch, lir->insImm(0x7)));
2566 LIns* test = lir->ins2(LIR_eq, lir->ins2(LIR_and, byte, bitMask), lir->insImm(0));
2567
2568 LIns* to_next = lir->insBranch(LIR_jt, test, 0);
2569 if (!fails.append(to_next))
2570 return NULL;
2571 return lir->ins2(LIR_piadd, pos, lir->insImmWord(2));
2572 }
2573
2574 /* Factor out common code to index js_alnum. */
2575 LIns *compileTableRead(LIns *chr, const bool *tbl)
2576 {
2577 if (sizeof(bool) != 1) {
2578 LIns *sizeLog2 = lir->insImm(StaticLog2<sizeof(bool)>::result);
2579 chr = lir->ins2(LIR_lsh, chr, sizeLog2);
2580 }
2581 LIns *addr = lir->ins2(LIR_piadd, lir->insImmPtr(tbl), lir->ins_u2p(chr));
2582 return lir->insLoad(LIR_ldcb, addr, 0);
2583 }
2584
2585 /* Compile a builtin character class. */
2586 LIns *compileBuiltinClass(RENode *node, LIns *pos, LInsList &fails)
2587 {
2588 /* All the builtins checked below consume one character. */
2589 if (!fails.append(lir->insBranch(LIR_jf, lir->ins2(LIR_plt, pos, cpend), 0)))
2590 return NULL;
2591 LIns *chr = lir->insLoad(LIR_ldcs, pos, 0);
2592
2593 switch (node->op) {
2594 case REOP_DOT:
2595 {
2596 /* Accept any character except those in ECMA-262 15.10.2.8. */
2597 LIns *eq1 = lir->ins2(LIR_eq, chr, lir->insImm('\n'));
2598 if (!fails.append(lir->insBranch(LIR_jt, eq1, NULL)))
2599 return NULL;
2600 LIns *eq2 = lir->ins2(LIR_eq, chr, lir->insImm('\r'));
2601 if (!fails.append(lir->insBranch(LIR_jt, eq2, NULL)))
2602 return NULL;
2603 LIns *eq3 = lir->ins2(LIR_eq, chr, lir->insImm(LINE_SEPARATOR));
2604 if (!fails.append(lir->insBranch(LIR_jt, eq3, NULL)))
2605 return NULL;
2606 LIns *eq4 = lir->ins2(LIR_eq, chr, lir->insImm(PARA_SEPARATOR));
2607 if (!fails.append(lir->insBranch(LIR_jt, eq4, NULL)))
2608 return NULL;
2609 break;
2610 }
2611 case REOP_DIGIT:
2612 {
2613 LIns *ge = lir->ins2(LIR_ge, chr, lir->insImm('0'));
2614 if (!fails.append(lir->insBranch(LIR_jf, ge, NULL)))
2615 return NULL;
2616 LIns *le = lir->ins2(LIR_le, chr, lir->insImm('9'));
2617 if (!fails.append(lir->insBranch(LIR_jf, le, NULL)))
2618 return NULL;
2619 break;
2620 }
2621 case REOP_NONDIGIT:
2622 {
2623 /* Use 'and' to give a predictable branch for success path. */
2624 LIns *ge = lir->ins2(LIR_ge, chr, lir->insImm('0'));
2625 LIns *le = lir->ins2(LIR_le, chr, lir->insImm('9'));
2626 LIns *both = lir->ins2(LIR_and, ge, le);
2627 if (!fails.append(lir->insBranch(LIR_jf, lir->ins_eq0(both), NULL)))
2628 return NULL;
2629 break;
2630 }
2631 case REOP_ALNUM:
2632 {
2633 /*
2634 * Compile the condition:
2635 * ((uint)*cp) < 128 && js_alnum[(uint)*cp]
2636 */
2637 LIns *rangeCnd = lir->ins2(LIR_ult, chr, lir->insImm(128));
2638 if (!fails.append(lir->insBranch(LIR_jf, rangeCnd, NULL)))
2639 return NULL;
2640 LIns *tableVal = compileTableRead(chr, js_alnum);
2641 if (!fails.append(lir->insBranch(LIR_jt, lir->ins_eq0(tableVal), NULL)))
2642 return NULL;
2643 break;
2644 }
2645 case REOP_NONALNUM:
2646 {
2647 /*
2648 * Compile the condition:
2649 * ((uint)*cp) >= 128 || !js_alnum[(uint)*cp]
2650 */
2651 LIns *rangeCnd = lir->ins2(LIR_uge, chr, lir->insImm(128));
2652 LIns *rangeBr = lir->insBranch(LIR_jt, rangeCnd, NULL);
2653 LIns *tableVal = compileTableRead(chr, js_alnum);
2654 if (!fails.append(lir->insBranch(LIR_jf, lir->ins_eq0(tableVal), NULL)))
2655 return NULL;
2656 LIns *success = lir->ins0(LIR_label);
2657 rangeBr->setTarget(success);
2658 break;
2659 }
2660 case REOP_SPACE:
2661 case REOP_NONSPACE:
2662 {
2663 /*
2664 * ECMA-262 7.2, 7.3, and 15.10.2.12 define a bunch of Unicode code
2665 * points for whitespace. We optimize here for the common case of
2666 * ASCII characters using a table lookup for the lower block that
2667 * can actually contain spaces. For the rest, use a (more or less)
2668 * binary search to minimize tests.
2669 *
2670 * [0000,0020]: 9, A, B, C, D, 20
2671 * (0020,00A0): none
2672 * [00A0,2000): A0, 1680, 180E
2673 * [2000,200A]: all
2674 * (200A, max): 2028, 2029, 202F, 205F, 3000
2675 */
2676 /* Below 0x20? */
2677 LIns *tableRangeCnd = lir->ins2(LIR_ule, chr, lir->insImm(0x20));
2678 LIns *tableRangeBr = lir->insBranch(LIR_jt, tableRangeCnd, NULL);
2679 /* Fall through means *chr > 0x20. */
2680
2681 /* Handle (0x20,0xA0). */
2682 LIns *asciiCnd = lir->ins2(LIR_ult, chr, lir->insImm(0xA0));
2683 LIns *asciiMissBr = lir->insBranch(LIR_jt, asciiCnd, NULL);
2684 /* Fall through means *chr >= 0xA0. */
2685
2686 /* Partition around [0x2000,0x200A]. */
2687 LIns *belowCnd = lir->ins2(LIR_ult, chr, lir->insImm(0x2000));
2688 LIns *belowBr = lir->insBranch(LIR_jt, belowCnd, NULL);
2689 LIns *aboveCnd = lir->ins2(LIR_ugt, chr, lir->insImm(0x200A));
2690 LIns *aboveBr = lir->insBranch(LIR_jt, aboveCnd, NULL);
2691 LIns *intervalMatchBr = lir->ins2(LIR_j, NULL, NULL);
2692
2693 /* Handle [0xA0,0x2000). */
2694 LIns *belowLbl = lir->ins0(LIR_label);
2695 belowBr->setTarget(belowLbl);
2696 LIns *eq1Cnd = lir->ins2(LIR_eq, chr, lir->insImm(0xA0));
2697 LIns *eq1Br = lir->insBranch(LIR_jt, eq1Cnd, NULL);
2698 LIns *eq2Cnd = lir->ins2(LIR_eq, chr, lir->insImm(0x1680));
2699 LIns *eq2Br = lir->insBranch(LIR_jt, eq2Cnd, NULL);
2700 LIns *eq3Cnd = lir->ins2(LIR_eq, chr, lir->insImm(0x180E));
2701 LIns *eq3Br = lir->insBranch(LIR_jt, eq3Cnd, NULL);
2702 LIns *belowMissBr = lir->ins2(LIR_j, NULL, NULL);
2703
2704 /* Handle (0x200A, max). */
2705 LIns *aboveLbl = lir->ins0(LIR_label);
2706 aboveBr->setTarget(aboveLbl);
2707 LIns *eq4Cnd = lir->ins2(LIR_eq, chr, lir->insImm(0x2028));
2708 LIns *eq4Br = lir->insBranch(LIR_jt, eq4Cnd, NULL);
2709 LIns *eq5Cnd = lir->ins2(LIR_eq, chr, lir->insImm(0x2029));
2710 LIns *eq5Br = lir->insBranch(LIR_jt, eq5Cnd, NULL);
2711 LIns *eq6Cnd = lir->ins2(LIR_eq, chr, lir->insImm(0x202F));
2712 LIns *eq6Br = lir->insBranch(LIR_jt, eq6Cnd, NULL);
2713 LIns *eq7Cnd = lir->ins2(LIR_eq, chr, lir->insImm(0x205F));
2714 LIns *eq7Br = lir->insBranch(LIR_jt, eq7Cnd, NULL);
2715 LIns *eq8Cnd = lir->ins2(LIR_eq, chr, lir->insImm(0x3000));
2716 LIns *eq8Br = lir->insBranch(LIR_jt, eq8Cnd, NULL);
2717 LIns *aboveMissBr = lir->ins2(LIR_j, NULL, NULL);
2718
2719 /* Handle [0,0x20]. */
2720 LIns *tableLbl = lir->ins0(LIR_label);
2721 tableRangeBr->setTarget(tableLbl);
2722 LIns *tableVal = compileTableRead(chr, js_ws);
2723 LIns *tableCnd = lir->ins_eq0(tableVal);
2724 LIns *tableMatchBr = lir->insBranch(LIR_jf, tableCnd, NULL);
2725
2726 /* Collect misses. */
2727 LIns *missLbl = lir->ins0(LIR_label);
2728 asciiMissBr->setTarget(missLbl);
2729 belowMissBr->setTarget(missLbl);
2730 aboveMissBr->setTarget(missLbl);
2731 LIns *missBr = lir->ins2(LIR_j, NULL, NULL);
2732 if (node->op == REOP_SPACE) {
2733 if (!fails.append(missBr))
2734 return NULL;
2735 }
2736
2737 /* Collect matches. */
2738 LIns *matchLbl = lir->ins0(LIR_label);
2739 intervalMatchBr->setTarget(matchLbl);
2740 tableMatchBr->setTarget(matchLbl);
2741 eq1Br->setTarget(matchLbl); eq2Br->setTarget(matchLbl);
2742 eq3Br->setTarget(matchLbl); eq4Br->setTarget(matchLbl);
2743 eq5Br->setTarget(matchLbl); eq6Br->setTarget(matchLbl);
2744 eq7Br->setTarget(matchLbl); eq8Br->setTarget(matchLbl);
2745 if (node->op == REOP_NONSPACE) {
2746 LIns *matchBr = lir->ins2(LIR_j, NULL, NULL);
2747 if (!fails.append(matchBr))
2748 return NULL;
2749 }
2750 /* Fall through means match == success. */
2751
2752 /* Collect successes to fall through. */
2753 LIns *success = lir->ins0(LIR_label);
2754 if (node->op == REOP_NONSPACE)
2755 missBr->setTarget(success);
2756 break;
2757 }
2758 default:
2759 return NULL;
2760 }
2761
2762 return lir->ins2(LIR_piadd, pos, lir->insImmWord(2));
2763 }
2764
2765 LIns *compileAlt(RENode *node, LIns *pos, bool atEnd, LInsList &fails)
2766 {
2767 RENode *leftRe = (RENode *)node->kid, *rightRe = (RENode *)node->u.kid2;
2768
2769 /*
2770 * If the RE continues after the alternative, we need to ensure that no
2771 * backtracking is required. Recursive calls to compileNode will fail
2772 * on capturing parens, so the only thing we have to check here is that,
2773 * if the left subexpression matches, we can keep going without later
2774 * deciding we need to try the right subexpression.
2775 */
2776 if (!atEnd) {
2777 /*
2778 * If there is no character overlap between left and right, then
2779 * there is only one possible path through the alternative.
2780 */
2781 CharSet leftSet, rightSet;
2782 if (!enumerateNextChars(cx, leftRe, leftSet) ||
2783 !enumerateNextChars(cx, rightRe, rightSet) ||
2784 !leftSet.disjoint(rightSet))
2785 return NULL;
2786
2787 /*
2788 * If there is an empty path through either subexpression, the above
2789 * check is incomplete; we need to include |node->next| as well.
2790 */
2791 bool epsLeft = mayMatchEmpty(leftRe),
2792 epsRight = mayMatchEmpty(rightRe);
2793 if (epsRight && epsLeft) {
2794 return NULL;
2795 } else if (epsLeft || epsRight) {
2796 CharSet nextSet;
2797 if (!enumerateNextChars(cx, node->next, nextSet) ||
2798 (epsLeft && !nextSet.disjoint(rightSet)) ||
2799 (epsRight && !nextSet.disjoint(leftSet))) {
2800 return NULL;
2801 }
2802 }
2803 }
2804
2805 /* Try left branch. */
2806 LInsList kidFails(cx);
2807 LIns *branchEnd = compileNode(leftRe, pos, atEnd, kidFails);
2808 if (!branchEnd)
2809 return NULL;
2810
2811 /*
2812 * Since there are no phis, simulate by writing to and reading from
2813 * memory (REGlobalData::stateStack, since it is unused).
2814 */
2815 lir->insStorei(branchEnd, state,
2816 offsetof(REGlobalData, stateStack));
2817 LIns *leftSuccess = lir->ins2(LIR_j, NULL, NULL);
2818
2819 /* Try right branch. */
2820 targetCurrentPoint(kidFails);
2821 if (!(branchEnd = compileNode(rightRe, pos, atEnd, fails)))
2822 return NULL;
2823 lir->insStorei(branchEnd, state,
2824 offsetof(REGlobalData, stateStack));
2825
2826 /* Land success on the left branch. */
2827 targetCurrentPoint(leftSuccess);
2828 return addName(fragment->lirbuf,
2829 lir->insLoad(LIR_ldp, state,
2830 offsetof(REGlobalData, stateStack)),
2831 "pos");
2832 }
2833
2834 LIns *compileOpt(RENode *node, LIns *pos, bool atEnd, LInsList &fails)
2835 {
2836 /*
2837 * Since there are no phis, simulate by writing to and reading from
2838 * memory (REGlobalData::stateStack, since it is unused).
2839 */
2840 lir->insStorei(pos, state, offsetof(REGlobalData, stateStack));
2841
2842 /* Try ? body. */
2843 LInsList kidFails(cx);
2844 if (!(pos = compileNode(node, pos, atEnd, kidFails)))
2845 return NULL;
2846 lir->insStorei(pos, state, offsetof(REGlobalData, stateStack));
2847
2848 /* Join success and failure and get new position. */
2849 targetCurrentPoint(kidFails);
2850 pos = addName(fragment->lirbuf,
2851 lir->insLoad(LIR_ldp, state,
2852 offsetof(REGlobalData, stateStack)),
2853 "pos");
2854
2855 return pos;
2856 }
2857
2858 LIns *compileQuant(RENode *node, LIns *pos, bool atEnd, LInsList &fails)
2859 {
2860 /* Only support greedy *, +, ?. */
2861 if (!node->u.range.greedy ||
2862 node->u.range.min > 1 ||
2863 (node->u.range.max > 1 && node->u.range.max < (uintN)-1)) {
2864 return NULL;
2865 }
2866
2867 RENode *bodyRe = (RENode *)node->kid;
2868
2869 /*
2870 * If the RE continues after the alternative, we need to ensure that no
2871 * backtracking is required. Recursive calls to compileNode will fail
2872 * on capturing parens, so the only thing we have to check here is that,
2873 * if the quantifier body matches, we can continue matching the body
2874 * without later deciding we need to undo the body matches.
2875 */
2876 if (!atEnd) {
2877 /*
2878 * If there is no character overlap between the body and
2879 * |node->next|, then all possible body matches are used.
2880 */
2881 CharSet bodySet, nextSet;
2882 if (!enumerateNextChars(cx, bodyRe, bodySet) ||
2883 !enumerateNextChars(cx, node->next, nextSet) ||
2884 !bodySet.disjoint(nextSet)) {
2885 return NULL;
2886 }
2887 }
2888
2889 /* Fork off ? and {1,1}. */
2890 if (node->u.range.max == 1) {
2891 if (node->u.range.min == 1)
2892 return compileNode(bodyRe, pos, atEnd, fails);
2893 else
2894 return compileOpt(bodyRe, pos, atEnd, fails);
2895 }
2896
2897 /* For +, compile a copy of the body where failure is real failure. */
2898 if (node->u.range.min == 1) {
2899 if (!(pos = compileNode(bodyRe, pos, atEnd, fails)))
2900 return NULL;
2901 }
2902
2903 /*
2904 * Since there are no phis, simulate by writing to and reading from
2905 * memory (REGlobalData::stateStack, since it is unused).
2906 */
2907 lir->insStorei(pos, state, offsetof(REGlobalData, stateStack));
2908
2909 /* Begin iteration: load loop variables. */
2910 LIns *loopTop = lir->ins0(LIR_label);
2911 LIns *iterBegin = addName(fragment->lirbuf,
2912 lir->insLoad(LIR_ldp, state,
2913 offsetof(REGlobalData, stateStack)),
2914 "pos");
2915
2916 /* Match quantifier body. */
2917 LInsList kidFails(cx);
2918 LIns *iterEnd = compileNode(bodyRe, iterBegin, atEnd, kidFails);
2919 if (!iterEnd)
2920 return NULL;
2921
2922 /*
2923 * If there is an epsilon path through the body then, when it is taken,
2924 * we need to abort the loop or else we will loop forever.
2925 */
2926 if (mayMatchEmpty(bodyRe)) {
2927 LIns *eqCnd = lir->ins2(LIR_peq, iterBegin, iterEnd);
2928 if (!kidFails.append(lir->insBranch(LIR_jt, eqCnd, NULL)))
2929 return NULL;
2930 }
2931
2932 /* End iteration: store loop variables, increment, jump */
2933 lir->insStorei(iterEnd, state, offsetof(REGlobalData, stateStack));
2934 lir->ins2(LIR_j, NULL, loopTop);
2935
2936 /*
2937 * Using '+' as branch, the intended control flow is:
2938 *
2939 * ...
2940 * A -> |
2941 * |<---.
2942 * B -> | |
2943 * +--. |
2944 * C -> | | |
2945 * +--. |
2946 * D -> | | |
2947 * +--|-'
2948 * X -> | |
2949 * |<-'
2950 * E -> |
2951 * ...
2952 *
2953 * We are currently at point X. Since the regalloc makes a single,
2954 * linear, backwards sweep over the IR (going from E to A), point X
2955 * must tell the regalloc what LIR insns are live at the end of D.
2956 * Thus, we need to report *all* insns defined *before* the end of D
2957 * that may be used *after* D. This means insns defined in A, B, C, or
2958 * D and used in B, C, D, or E. Since insns in B, C, and D are
2959 * conditionally executed, and we (currently) don't have real phi
2960 * nodes, we need only consider insns defined in A and used in E.
2961 */
2962 lir->ins1(LIR_live, state);
2963 lir->ins1(LIR_live, cpend);
2964 lir->ins1(LIR_live, start);
2965
2966 /* After the loop: reload 'pos' from memory and continue. */
2967 targetCurrentPoint(kidFails);
2968 return iterBegin;
2969 }
2970
2971 /*
2972 * Compile the regular expression rooted at 'node'. Return 0 on failed
2973 * compilation. Otherwise, generate code that falls through on success (the
2974 * returned LIns* is the current 'pos') and jumps to the end on failure (by
2975 * adding the guard LIns to 'fails').
2976 */
2977 LIns *compileNode(RENode *node, LIns *pos, bool atEnd, LInsList &fails)
2978 {
2979 for (; pos && node; node = node->next) {
2980 if (outOfMemory())
2981 return NULL;
2982
2983 bool childNextIsEnd = atEnd && !node->next;
2984
2985 switch (node->op) {
2986 case REOP_EMPTY:
2987 pos = compileEmpty(node, pos, fails);
2988 break;
2989 case REOP_FLAT:
2990 pos = compileFlat(node, pos, fails);
2991 break;
2992 case REOP_ALT:
2993 case REOP_ALTPREREQ:
2994 pos = compileAlt(node, pos, childNextIsEnd, fails);
2995 break;
2996 case REOP_QUANT:
2997 pos = compileQuant(node, pos, childNextIsEnd, fails);
2998 break;
2999 case REOP_CLASS:
3000 pos = compileClass(node, pos, fails);
3001 break;
3002 case REOP_DOT:
3003 case REOP_DIGIT:
3004 case REOP_NONDIGIT:
3005 case REOP_ALNUM:
3006 case REOP_NONALNUM:
3007 case REOP_SPACE:
3008 case REOP_NONSPACE:
3009 pos = compileBuiltinClass(node, pos, fails);
3010 break;
3011 default:
3012 return NULL;
3013 }
3014 }
3015 return pos;
3016 }
3017
3018 /*
3019 * This function kicks off recursive compileNode compilation, finishes the
3020 * success path, and lets the failed-match path fall through.
3021 */
3022 bool compileRootNode(RENode *root, LIns *pos, LIns *anchorFail)
3023 {
3024 /* Compile the regular expression body. */
3025 LInsList fails(cx);
3026 pos = compileNode(root, pos, true, fails);
3027 if (!pos)
3028 return false;
3029
3030 /* Fall-through from compileNode means success. */
3031 lir->insStorei(pos, state, offsetof(REGlobalData, stateStack));
3032 lir->ins0(LIR_regfence);
3033 lir->ins1(LIR_ret, lir->insImm(1));
3034
3035 /* Stick return here so we don't have to jump over it every time. */
3036 if (anchorFail) {
3037 targetCurrentPoint(anchorFail);
3038 lir->ins0(LIR_regfence);
3039 lir->ins1(LIR_ret, lir->insImm(0));
3040 }
3041
3042 /* Target failed matches. */
3043 targetCurrentPoint(fails);
3044 return true;
3045 }
3046
3047 /* Compile a regular expressions that can only match on the first char. */
3048 bool compileSticky(RENode *root, LIns *start)
3049 {
3050 if (!compileRootNode(root, start, NULL))
3051 return false;
3052
3053 /* Failed to match on first character, so fail whole match. */
3054 lir->ins0(LIR_regfence);
3055 lir->ins1(LIR_ret, lir->insImm(0));
3056 return !outOfMemory();
3057 }
3058
3059 /* Compile normal regular expressions that can match starting at any char. */
3060 bool compileAnchoring(RENode *root, LIns *start)
3061 {
3062 /* Guard outer anchoring loop. Use <= to allow empty regexp match. */
3063 LIns *anchorFail = lir->insBranch(LIR_jf, lir->ins2(LIR_ple, start, cpend), 0);
3064
3065 if (!compileRootNode(root, start, anchorFail))
3066 return false;
3067
3068 /* Outer loop increment. */
3069 lir->insStorei(lir->ins2(LIR_piadd, start, lir->insImmWord(2)), state,
3070 offsetof(REGlobalData, skipped));
3071
3072 return !outOfMemory();
3073 }
3074
3075 inline LIns*
3076 addName(LirBuffer* lirbuf, LIns* ins, const char* name)
3077 {
3078 #ifdef NJ_VERBOSE
3079 debug_only_stmt(lirbuf->names->addName(ins, name);)
3080 #endif
3081 return ins;
3082 }
3083
3084 /*
3085 * Insert the side exit and guard record for a compiled regexp. Most
3086 * of the fields are not used. The important part is the regexp source
3087 * and flags, which we use as the fragment lookup key.
3088 */
3089 GuardRecord* insertGuard(LIns* loopLabel, const jschar* re_chars, size_t re_length)
3090 {
3091 if (loopLabel) {
3092 lir->insBranch(LIR_j, NULL, loopLabel);
3093 LirBuffer* lirbuf = fragment->lirbuf;
3094 lir->ins1(LIR_live, lirbuf->state);
3095 lir->ins1(LIR_live, lirbuf->param1);
3096 }
3097
3098 Allocator &alloc = *JS_TRACE_MONITOR(cx).dataAlloc;
3099
3100 size_t len = (sizeof(GuardRecord) +
3101 sizeof(VMSideExit) +
3102 (re_length-1) * sizeof(jschar));
3103 GuardRecord* guard = (GuardRecord *) alloc.alloc(len);
3104 memset(guard, 0, len);
3105 VMSideExit* exit = (VMSideExit*)(guard+1);
3106 guard->exit = exit;
3107 guard->exit->target = fragment;
3108 fragment->lastIns = lir->insGuard(LIR_x, NULL, guard);
3109 // guard->profCount is memset'd to zero
3110 verbose_only(
3111 guard->profGuardID = fragment->guardNumberer++;
3112 guard->nextInFrag = fragment->guardsForFrag;
3113 fragment->guardsForFrag = guard;
3114 )
3115 return guard;
3116 }
3117
3118 public:
3119 RegExpNativeCompiler(JSContext* cx, JSRegExp* re, CompilerState* cs, Fragment* fragment)
3120 : tempAlloc(*JS_TRACE_MONITOR(cx).reTempAlloc), cx(cx),
3121 re(re), cs(cs), fragment(fragment), lir(NULL), lirBufWriter(NULL) { }
3122
3123 ~RegExpNativeCompiler() {
3124 /* Purge the tempAlloc used during recording. */
3125 tempAlloc.reset();
3126 JS_TRACE_MONITOR(cx).reLirBuf->clear();
3127 }
3128
3129 JSBool compile()
3130 {
3131 GuardRecord* guard = NULL;
3132 const jschar* re_chars;
3133 size_t re_length;
3134 JSTraceMonitor* tm = &JS_TRACE_MONITOR(cx);
3135 Assembler *assm = tm->assembler;
3136 LIns* loopLabel = NULL;
3137
3138 if (outOfMemory() || js_OverfullJITCache(tm))
3139 return JS_FALSE;
3140
3141 re->source->getCharsAndLength(re_chars, re_length);
3142 /*
3143 * If the regexp is too long nanojit will assert when we
3144 * try to insert the guard record.
3145 */
3146 if (re_length > 1024) {
3147 re->flags |= JSREG_NOCOMPILE;
3148 return JS_FALSE;
3149 }
3150
3151 /* At this point we have an empty fragment. */
3152 LirBuffer* lirbuf = fragment->lirbuf;
3153 if (outOfMemory())
3154 goto fail;
3155 /* FIXME Use bug 463260 smart pointer when available. */
3156 lir = lirBufWriter = new LirBu