Publishing SLRE regex - as a library rather than program as is the case for the existing publish

Dependents:   EMBEDDED_CW2 EMBEDDED_CW2_Final USBSerialIORemotizer

Committer:
andrewbonney
Date:
Fri Apr 08 14:41:11 2011 +0000
Revision:
0:bc6b8c94c371

        

Who changed what in which revision?

UserRevisionLine numberNew contents of line
andrewbonney 0:bc6b8c94c371 1 /*
andrewbonney 0:bc6b8c94c371 2 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
andrewbonney 0:bc6b8c94c371 3 * All rights reserved
andrewbonney 0:bc6b8c94c371 4 *
andrewbonney 0:bc6b8c94c371 5 * "THE BEER-WARE LICENSE" (Revision 42):
andrewbonney 0:bc6b8c94c371 6 * Sergey Lyubka wrote this file. As long as you retain this notice you
andrewbonney 0:bc6b8c94c371 7 * can do whatever you want with this stuff. If we meet some day, and you think
andrewbonney 0:bc6b8c94c371 8 * this stuff is worth it, you can buy me a beer in return.
andrewbonney 0:bc6b8c94c371 9 */
andrewbonney 0:bc6b8c94c371 10
andrewbonney 0:bc6b8c94c371 11 #include <stdio.h>
andrewbonney 0:bc6b8c94c371 12 #include <assert.h>
andrewbonney 0:bc6b8c94c371 13 #include <ctype.h>
andrewbonney 0:bc6b8c94c371 14 #include <stdlib.h>
andrewbonney 0:bc6b8c94c371 15 #include <string.h>
andrewbonney 0:bc6b8c94c371 16 #include <errno.h>
andrewbonney 0:bc6b8c94c371 17
andrewbonney 0:bc6b8c94c371 18 #include "slre.h"
andrewbonney 0:bc6b8c94c371 19
andrewbonney 0:bc6b8c94c371 20 enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
andrewbonney 0:bc6b8c94c371 21 STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
andrewbonney 0:bc6b8c94c371 22
andrewbonney 0:bc6b8c94c371 23 static struct {
andrewbonney 0:bc6b8c94c371 24 const char *name;
andrewbonney 0:bc6b8c94c371 25 int narg;
andrewbonney 0:bc6b8c94c371 26 const char *flags;
andrewbonney 0:bc6b8c94c371 27 } opcodes[] = {
andrewbonney 0:bc6b8c94c371 28 {"END", 0, ""}, /* End of code block or program */
andrewbonney 0:bc6b8c94c371 29 {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */
andrewbonney 0:bc6b8c94c371 30 {"ANY", 0, ""}, /* Match any character, "." */
andrewbonney 0:bc6b8c94c371 31 {"EXACT", 2, "d"}, /* Match exact string */
andrewbonney 0:bc6b8c94c371 32 {"ANYOF", 2, "D"}, /* Match any from set, "[]" */
andrewbonney 0:bc6b8c94c371 33 {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/
andrewbonney 0:bc6b8c94c371 34 {"OPEN ", 1, "i"}, /* Capture start, "(" */
andrewbonney 0:bc6b8c94c371 35 {"CLOSE", 1, "i"}, /* Capture end, ")" */
andrewbonney 0:bc6b8c94c371 36 {"BOL", 0, ""}, /* Beginning of string, "^" */
andrewbonney 0:bc6b8c94c371 37 {"EOL", 0, ""}, /* End of string, "$" */
andrewbonney 0:bc6b8c94c371 38 {"STAR", 1, "o"}, /* Match zero or more times "*" */
andrewbonney 0:bc6b8c94c371 39 {"PLUS", 1, "o"}, /* Match one or more times, "+" */
andrewbonney 0:bc6b8c94c371 40 {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */
andrewbonney 0:bc6b8c94c371 41 {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */
andrewbonney 0:bc6b8c94c371 42 {"QUEST", 1, "o"}, /* Match zero or one time, "?" */
andrewbonney 0:bc6b8c94c371 43 {"SPACE", 0, ""}, /* Match whitespace, "\s" */
andrewbonney 0:bc6b8c94c371 44 {"NONSPACE", 0, ""}, /* Match non-space, "\S" */
andrewbonney 0:bc6b8c94c371 45 {"DIGIT", 0, ""} /* Match digit, "\d" */
andrewbonney 0:bc6b8c94c371 46 };
andrewbonney 0:bc6b8c94c371 47
andrewbonney 0:bc6b8c94c371 48 /*
andrewbonney 0:bc6b8c94c371 49 * Commands and operands are all unsigned char (1 byte long). All code offsets
andrewbonney 0:bc6b8c94c371 50 * are relative to current address, and positive (always point forward). Data
andrewbonney 0:bc6b8c94c371 51 * offsets are absolute. Commands with operands:
andrewbonney 0:bc6b8c94c371 52 *
andrewbonney 0:bc6b8c94c371 53 * BRANCH offset1 offset2
andrewbonney 0:bc6b8c94c371 54 * Try to match the code block that follows the BRANCH instruction
andrewbonney 0:bc6b8c94c371 55 * (code block ends with END). If no match, try to match code block that
andrewbonney 0:bc6b8c94c371 56 * starts at offset1. If either of these match, jump to offset2.
andrewbonney 0:bc6b8c94c371 57 *
andrewbonney 0:bc6b8c94c371 58 * EXACT data_offset data_length
andrewbonney 0:bc6b8c94c371 59 * Try to match exact string. String is recorded in data section from
andrewbonney 0:bc6b8c94c371 60 * data_offset, and has length data_length.
andrewbonney 0:bc6b8c94c371 61 *
andrewbonney 0:bc6b8c94c371 62 * OPEN capture_number, CLOSE capture_number
andrewbonney 0:bc6b8c94c371 63 * If the user have passed 'struct cap' array for captures, OPEN
andrewbonney 0:bc6b8c94c371 64 * records the beginning of the matched substring (cap->ptr), CLOSE
andrewbonney 0:bc6b8c94c371 65 * sets the length (cap->len) for respective capture_number.
andrewbonney 0:bc6b8c94c371 66 *
andrewbonney 0:bc6b8c94c371 67 * STAR code_offset, PLUS code_offset, QUEST code_offset
andrewbonney 0:bc6b8c94c371 68 * *, +, ?, respectively. Try to gobble as much as possible from the
andrewbonney 0:bc6b8c94c371 69 * matched buffer, until code block that follows these instructions
andrewbonney 0:bc6b8c94c371 70 * matches. When the longest possible string is matched,
andrewbonney 0:bc6b8c94c371 71 * jump to code_offset
andrewbonney 0:bc6b8c94c371 72 *
andrewbonney 0:bc6b8c94c371 73 * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
andrewbonney 0:bc6b8c94c371 74 */
andrewbonney 0:bc6b8c94c371 75
andrewbonney 0:bc6b8c94c371 76 static const char *meta_chars = "|.^$*+?()[\\";
andrewbonney 0:bc6b8c94c371 77
andrewbonney 0:bc6b8c94c371 78 static void
andrewbonney 0:bc6b8c94c371 79 print_character_set(FILE *fp, const unsigned char *p, int len)
andrewbonney 0:bc6b8c94c371 80 {
andrewbonney 0:bc6b8c94c371 81 int i;
andrewbonney 0:bc6b8c94c371 82
andrewbonney 0:bc6b8c94c371 83 for (i = 0; i < len; i++) {
andrewbonney 0:bc6b8c94c371 84 if (i > 0)
andrewbonney 0:bc6b8c94c371 85 (void) fputc(',', fp);
andrewbonney 0:bc6b8c94c371 86 if (p[i] == 0) {
andrewbonney 0:bc6b8c94c371 87 i++;
andrewbonney 0:bc6b8c94c371 88 if (p[i] == 0)
andrewbonney 0:bc6b8c94c371 89 (void) fprintf(fp, "\\x%02x", p[i]);
andrewbonney 0:bc6b8c94c371 90 else
andrewbonney 0:bc6b8c94c371 91 (void) fprintf(fp, "%s", opcodes[p[i]].name);
andrewbonney 0:bc6b8c94c371 92 } else if (isprint(p[i])) {
andrewbonney 0:bc6b8c94c371 93 (void) fputc(p[i], fp);
andrewbonney 0:bc6b8c94c371 94 } else {
andrewbonney 0:bc6b8c94c371 95 (void) fprintf(fp,"\\x%02x", p[i]);
andrewbonney 0:bc6b8c94c371 96 }
andrewbonney 0:bc6b8c94c371 97 }
andrewbonney 0:bc6b8c94c371 98 }
andrewbonney 0:bc6b8c94c371 99
andrewbonney 0:bc6b8c94c371 100 void
andrewbonney 0:bc6b8c94c371 101 slre_dump(const struct slre *r, FILE *fp)
andrewbonney 0:bc6b8c94c371 102 {
andrewbonney 0:bc6b8c94c371 103 int i, j, ch, op, pc;
andrewbonney 0:bc6b8c94c371 104
andrewbonney 0:bc6b8c94c371 105 for (pc = 0; pc < r->code_size; pc++) {
andrewbonney 0:bc6b8c94c371 106
andrewbonney 0:bc6b8c94c371 107 op = r->code[pc];
andrewbonney 0:bc6b8c94c371 108 (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
andrewbonney 0:bc6b8c94c371 109
andrewbonney 0:bc6b8c94c371 110 for (i = 0; opcodes[op].flags[i] != '\0'; i++)
andrewbonney 0:bc6b8c94c371 111 switch (opcodes[op].flags[i]) {
andrewbonney 0:bc6b8c94c371 112 case 'i':
andrewbonney 0:bc6b8c94c371 113 (void) fprintf(fp, "%d ", r->code[pc + 1]);
andrewbonney 0:bc6b8c94c371 114 pc++;
andrewbonney 0:bc6b8c94c371 115 break;
andrewbonney 0:bc6b8c94c371 116 case 'o':
andrewbonney 0:bc6b8c94c371 117 (void) fprintf(fp, "%d ",
andrewbonney 0:bc6b8c94c371 118 pc + r->code[pc + 1] - i);
andrewbonney 0:bc6b8c94c371 119 pc++;
andrewbonney 0:bc6b8c94c371 120 break;
andrewbonney 0:bc6b8c94c371 121 case 'D':
andrewbonney 0:bc6b8c94c371 122 print_character_set(fp, r->data +
andrewbonney 0:bc6b8c94c371 123 r->code[pc + 1], r->code[pc + 2]);
andrewbonney 0:bc6b8c94c371 124 pc += 2;
andrewbonney 0:bc6b8c94c371 125 break;
andrewbonney 0:bc6b8c94c371 126 case 'd':
andrewbonney 0:bc6b8c94c371 127 (void) fputc('"', fp);
andrewbonney 0:bc6b8c94c371 128 for (j = 0; j < r->code[pc + 2]; j++) {
andrewbonney 0:bc6b8c94c371 129 ch = r->data[r->code[pc + 1] + j];
andrewbonney 0:bc6b8c94c371 130 if (isprint(ch))
andrewbonney 0:bc6b8c94c371 131 (void) fputc(ch, fp);
andrewbonney 0:bc6b8c94c371 132 else
andrewbonney 0:bc6b8c94c371 133 (void) fprintf(fp,"\\x%02x",ch);
andrewbonney 0:bc6b8c94c371 134 }
andrewbonney 0:bc6b8c94c371 135 (void) fputc('"', fp);
andrewbonney 0:bc6b8c94c371 136 pc += 2;
andrewbonney 0:bc6b8c94c371 137 break;
andrewbonney 0:bc6b8c94c371 138 }
andrewbonney 0:bc6b8c94c371 139
andrewbonney 0:bc6b8c94c371 140 (void) fputc('\n', fp);
andrewbonney 0:bc6b8c94c371 141 }
andrewbonney 0:bc6b8c94c371 142 }
andrewbonney 0:bc6b8c94c371 143
andrewbonney 0:bc6b8c94c371 144 static void
andrewbonney 0:bc6b8c94c371 145 set_jump_offset(struct slre *r, int pc, int offset)
andrewbonney 0:bc6b8c94c371 146 {
andrewbonney 0:bc6b8c94c371 147 assert(offset < r->code_size);
andrewbonney 0:bc6b8c94c371 148
andrewbonney 0:bc6b8c94c371 149 if (r->code_size - offset > 0xff) {
andrewbonney 0:bc6b8c94c371 150 r->err_str = "Jump offset is too big";
andrewbonney 0:bc6b8c94c371 151 } else {
andrewbonney 0:bc6b8c94c371 152 r->code[pc] = (unsigned char) (r->code_size - offset);
andrewbonney 0:bc6b8c94c371 153 }
andrewbonney 0:bc6b8c94c371 154 }
andrewbonney 0:bc6b8c94c371 155
andrewbonney 0:bc6b8c94c371 156 static void
andrewbonney 0:bc6b8c94c371 157 emit(struct slre *r, int code)
andrewbonney 0:bc6b8c94c371 158 {
andrewbonney 0:bc6b8c94c371 159 if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
andrewbonney 0:bc6b8c94c371 160 r->err_str = "RE is too long (code overflow)";
andrewbonney 0:bc6b8c94c371 161 else
andrewbonney 0:bc6b8c94c371 162 r->code[r->code_size++] = (unsigned char) code;
andrewbonney 0:bc6b8c94c371 163 }
andrewbonney 0:bc6b8c94c371 164
andrewbonney 0:bc6b8c94c371 165 static void
andrewbonney 0:bc6b8c94c371 166 store_char_in_data(struct slre *r, int ch)
andrewbonney 0:bc6b8c94c371 167 {
andrewbonney 0:bc6b8c94c371 168 if (r->data_size >= (int) sizeof(r->data))
andrewbonney 0:bc6b8c94c371 169 r->err_str = "RE is too long (data overflow)";
andrewbonney 0:bc6b8c94c371 170 else
andrewbonney 0:bc6b8c94c371 171 r->data[r->data_size++] = ch;
andrewbonney 0:bc6b8c94c371 172 }
andrewbonney 0:bc6b8c94c371 173
andrewbonney 0:bc6b8c94c371 174 static void
andrewbonney 0:bc6b8c94c371 175 exact(struct slre *r, const char **re)
andrewbonney 0:bc6b8c94c371 176 {
andrewbonney 0:bc6b8c94c371 177 int old_data_size = r->data_size;
andrewbonney 0:bc6b8c94c371 178
andrewbonney 0:bc6b8c94c371 179 while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
andrewbonney 0:bc6b8c94c371 180 store_char_in_data(r, *(*re)++);
andrewbonney 0:bc6b8c94c371 181
andrewbonney 0:bc6b8c94c371 182 emit(r, EXACT);
andrewbonney 0:bc6b8c94c371 183 emit(r, old_data_size);
andrewbonney 0:bc6b8c94c371 184 emit(r, r->data_size - old_data_size);
andrewbonney 0:bc6b8c94c371 185 }
andrewbonney 0:bc6b8c94c371 186
andrewbonney 0:bc6b8c94c371 187 static int
andrewbonney 0:bc6b8c94c371 188 get_escape_char(const char **re)
andrewbonney 0:bc6b8c94c371 189 {
andrewbonney 0:bc6b8c94c371 190 int res;
andrewbonney 0:bc6b8c94c371 191
andrewbonney 0:bc6b8c94c371 192 switch (*(*re)++) {
andrewbonney 0:bc6b8c94c371 193 case 'n': res = '\n'; break;
andrewbonney 0:bc6b8c94c371 194 case 'r': res = '\r'; break;
andrewbonney 0:bc6b8c94c371 195 case 't': res = '\t'; break;
andrewbonney 0:bc6b8c94c371 196 case '0': res = 0; break;
andrewbonney 0:bc6b8c94c371 197 case 'S': res = NONSPACE << 8; break;
andrewbonney 0:bc6b8c94c371 198 case 's': res = SPACE << 8; break;
andrewbonney 0:bc6b8c94c371 199 case 'd': res = DIGIT << 8; break;
andrewbonney 0:bc6b8c94c371 200 default: res = (*re)[-1]; break;
andrewbonney 0:bc6b8c94c371 201 }
andrewbonney 0:bc6b8c94c371 202
andrewbonney 0:bc6b8c94c371 203 return (res);
andrewbonney 0:bc6b8c94c371 204 }
andrewbonney 0:bc6b8c94c371 205
andrewbonney 0:bc6b8c94c371 206 static void
andrewbonney 0:bc6b8c94c371 207 anyof(struct slre *r, const char **re)
andrewbonney 0:bc6b8c94c371 208 {
andrewbonney 0:bc6b8c94c371 209 int esc, old_data_size = r->data_size, op = ANYOF;
andrewbonney 0:bc6b8c94c371 210
andrewbonney 0:bc6b8c94c371 211 if (**re == '^') {
andrewbonney 0:bc6b8c94c371 212 op = ANYBUT;
andrewbonney 0:bc6b8c94c371 213 (*re)++;
andrewbonney 0:bc6b8c94c371 214 }
andrewbonney 0:bc6b8c94c371 215
andrewbonney 0:bc6b8c94c371 216 while (**re != '\0')
andrewbonney 0:bc6b8c94c371 217
andrewbonney 0:bc6b8c94c371 218 switch (*(*re)++) {
andrewbonney 0:bc6b8c94c371 219 case ']':
andrewbonney 0:bc6b8c94c371 220 emit(r, op);
andrewbonney 0:bc6b8c94c371 221 emit(r, old_data_size);
andrewbonney 0:bc6b8c94c371 222 emit(r, r->data_size - old_data_size);
andrewbonney 0:bc6b8c94c371 223 return;
andrewbonney 0:bc6b8c94c371 224 /* NOTREACHED */
andrewbonney 0:bc6b8c94c371 225 break;
andrewbonney 0:bc6b8c94c371 226 case '\\':
andrewbonney 0:bc6b8c94c371 227 esc = get_escape_char(re);
andrewbonney 0:bc6b8c94c371 228 if ((esc & 0xff) == 0) {
andrewbonney 0:bc6b8c94c371 229 store_char_in_data(r, 0);
andrewbonney 0:bc6b8c94c371 230 store_char_in_data(r, esc >> 8);
andrewbonney 0:bc6b8c94c371 231 } else {
andrewbonney 0:bc6b8c94c371 232 store_char_in_data(r, esc);
andrewbonney 0:bc6b8c94c371 233 }
andrewbonney 0:bc6b8c94c371 234 break;
andrewbonney 0:bc6b8c94c371 235 default:
andrewbonney 0:bc6b8c94c371 236 store_char_in_data(r, (*re)[-1]);
andrewbonney 0:bc6b8c94c371 237 break;
andrewbonney 0:bc6b8c94c371 238 }
andrewbonney 0:bc6b8c94c371 239
andrewbonney 0:bc6b8c94c371 240 r->err_str = "No closing ']' bracket";
andrewbonney 0:bc6b8c94c371 241 }
andrewbonney 0:bc6b8c94c371 242
andrewbonney 0:bc6b8c94c371 243 static void
andrewbonney 0:bc6b8c94c371 244 relocate(struct slre *r, int begin, int shift)
andrewbonney 0:bc6b8c94c371 245 {
andrewbonney 0:bc6b8c94c371 246 emit(r, END);
andrewbonney 0:bc6b8c94c371 247 memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
andrewbonney 0:bc6b8c94c371 248 r->code_size += shift;
andrewbonney 0:bc6b8c94c371 249 }
andrewbonney 0:bc6b8c94c371 250
andrewbonney 0:bc6b8c94c371 251 static void
andrewbonney 0:bc6b8c94c371 252 quantifier(struct slre *r, int prev, int op)
andrewbonney 0:bc6b8c94c371 253 {
andrewbonney 0:bc6b8c94c371 254 if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
andrewbonney 0:bc6b8c94c371 255 r->code[prev + 2]--;
andrewbonney 0:bc6b8c94c371 256 emit(r, EXACT);
andrewbonney 0:bc6b8c94c371 257 emit(r, r->code[prev + 1] + r->code[prev + 2]);
andrewbonney 0:bc6b8c94c371 258 emit(r, 1);
andrewbonney 0:bc6b8c94c371 259 prev = r->code_size - 3;
andrewbonney 0:bc6b8c94c371 260 }
andrewbonney 0:bc6b8c94c371 261 relocate(r, prev, 2);
andrewbonney 0:bc6b8c94c371 262 r->code[prev] = op;
andrewbonney 0:bc6b8c94c371 263 set_jump_offset(r, prev + 1, prev);
andrewbonney 0:bc6b8c94c371 264 }
andrewbonney 0:bc6b8c94c371 265
andrewbonney 0:bc6b8c94c371 266 static void
andrewbonney 0:bc6b8c94c371 267 exact_one_char(struct slre *r, int ch)
andrewbonney 0:bc6b8c94c371 268 {
andrewbonney 0:bc6b8c94c371 269 emit(r, EXACT);
andrewbonney 0:bc6b8c94c371 270 emit(r, r->data_size);
andrewbonney 0:bc6b8c94c371 271 emit(r, 1);
andrewbonney 0:bc6b8c94c371 272 store_char_in_data(r, ch);
andrewbonney 0:bc6b8c94c371 273 }
andrewbonney 0:bc6b8c94c371 274
andrewbonney 0:bc6b8c94c371 275 static void
andrewbonney 0:bc6b8c94c371 276 fixup_branch(struct slre *r, int fixup)
andrewbonney 0:bc6b8c94c371 277 {
andrewbonney 0:bc6b8c94c371 278 if (fixup > 0) {
andrewbonney 0:bc6b8c94c371 279 emit(r, END);
andrewbonney 0:bc6b8c94c371 280 set_jump_offset(r, fixup, fixup - 2);
andrewbonney 0:bc6b8c94c371 281 }
andrewbonney 0:bc6b8c94c371 282 }
andrewbonney 0:bc6b8c94c371 283
andrewbonney 0:bc6b8c94c371 284 static void
andrewbonney 0:bc6b8c94c371 285 compile(struct slre *r, const char **re)
andrewbonney 0:bc6b8c94c371 286 {
andrewbonney 0:bc6b8c94c371 287 int op, esc, branch_start, last_op, fixup, cap_no, level;
andrewbonney 0:bc6b8c94c371 288
andrewbonney 0:bc6b8c94c371 289 fixup = 0;
andrewbonney 0:bc6b8c94c371 290 level = r->num_caps;
andrewbonney 0:bc6b8c94c371 291 branch_start = last_op = r->code_size;
andrewbonney 0:bc6b8c94c371 292
andrewbonney 0:bc6b8c94c371 293 for (;;)
andrewbonney 0:bc6b8c94c371 294 switch (*(*re)++) {
andrewbonney 0:bc6b8c94c371 295 case '\0':
andrewbonney 0:bc6b8c94c371 296 (*re)--;
andrewbonney 0:bc6b8c94c371 297 return;
andrewbonney 0:bc6b8c94c371 298 /* NOTREACHED */
andrewbonney 0:bc6b8c94c371 299 break;
andrewbonney 0:bc6b8c94c371 300 case '^':
andrewbonney 0:bc6b8c94c371 301 emit(r, BOL);
andrewbonney 0:bc6b8c94c371 302 break;
andrewbonney 0:bc6b8c94c371 303 case '$':
andrewbonney 0:bc6b8c94c371 304 emit(r, EOL);
andrewbonney 0:bc6b8c94c371 305 break;
andrewbonney 0:bc6b8c94c371 306 case '.':
andrewbonney 0:bc6b8c94c371 307 last_op = r->code_size;
andrewbonney 0:bc6b8c94c371 308 emit(r, ANY);
andrewbonney 0:bc6b8c94c371 309 break;
andrewbonney 0:bc6b8c94c371 310 case '[':
andrewbonney 0:bc6b8c94c371 311 anyof(r, re);
andrewbonney 0:bc6b8c94c371 312 break;
andrewbonney 0:bc6b8c94c371 313 case '\\':
andrewbonney 0:bc6b8c94c371 314 last_op = r->code_size;
andrewbonney 0:bc6b8c94c371 315 esc = get_escape_char(re);
andrewbonney 0:bc6b8c94c371 316 if (esc & 0xff00) {
andrewbonney 0:bc6b8c94c371 317 emit(r, esc >> 8);
andrewbonney 0:bc6b8c94c371 318 } else {
andrewbonney 0:bc6b8c94c371 319 exact_one_char(r, esc);
andrewbonney 0:bc6b8c94c371 320 }
andrewbonney 0:bc6b8c94c371 321 break;
andrewbonney 0:bc6b8c94c371 322 case '(':
andrewbonney 0:bc6b8c94c371 323 last_op = r->code_size;
andrewbonney 0:bc6b8c94c371 324 cap_no = ++r->num_caps;
andrewbonney 0:bc6b8c94c371 325 emit(r, OPEN);
andrewbonney 0:bc6b8c94c371 326 emit(r, cap_no);
andrewbonney 0:bc6b8c94c371 327
andrewbonney 0:bc6b8c94c371 328 compile(r, re);
andrewbonney 0:bc6b8c94c371 329 if (*(*re)++ != ')') {
andrewbonney 0:bc6b8c94c371 330 r->err_str = "No closing bracket";
andrewbonney 0:bc6b8c94c371 331 return;
andrewbonney 0:bc6b8c94c371 332 }
andrewbonney 0:bc6b8c94c371 333
andrewbonney 0:bc6b8c94c371 334 emit(r, CLOSE);
andrewbonney 0:bc6b8c94c371 335 emit(r, cap_no);
andrewbonney 0:bc6b8c94c371 336 break;
andrewbonney 0:bc6b8c94c371 337 case ')':
andrewbonney 0:bc6b8c94c371 338 (*re)--;
andrewbonney 0:bc6b8c94c371 339 fixup_branch(r, fixup);
andrewbonney 0:bc6b8c94c371 340 if (level == 0) {
andrewbonney 0:bc6b8c94c371 341 r->err_str = "Unbalanced brackets";
andrewbonney 0:bc6b8c94c371 342 return;
andrewbonney 0:bc6b8c94c371 343 }
andrewbonney 0:bc6b8c94c371 344 return;
andrewbonney 0:bc6b8c94c371 345 /* NOTREACHED */
andrewbonney 0:bc6b8c94c371 346 break;
andrewbonney 0:bc6b8c94c371 347 case '+':
andrewbonney 0:bc6b8c94c371 348 case '*':
andrewbonney 0:bc6b8c94c371 349 op = (*re)[-1] == '*' ? STAR: PLUS;
andrewbonney 0:bc6b8c94c371 350 if (**re == '?') {
andrewbonney 0:bc6b8c94c371 351 (*re)++;
andrewbonney 0:bc6b8c94c371 352 op = op == STAR ? STARQ : PLUSQ;
andrewbonney 0:bc6b8c94c371 353 }
andrewbonney 0:bc6b8c94c371 354 quantifier(r, last_op, op);
andrewbonney 0:bc6b8c94c371 355 break;
andrewbonney 0:bc6b8c94c371 356 case '?':
andrewbonney 0:bc6b8c94c371 357 quantifier(r, last_op, QUEST);
andrewbonney 0:bc6b8c94c371 358 break;
andrewbonney 0:bc6b8c94c371 359 case '|':
andrewbonney 0:bc6b8c94c371 360 fixup_branch(r, fixup);
andrewbonney 0:bc6b8c94c371 361 relocate(r, branch_start, 3);
andrewbonney 0:bc6b8c94c371 362 r->code[branch_start] = BRANCH;
andrewbonney 0:bc6b8c94c371 363 set_jump_offset(r, branch_start + 1, branch_start);
andrewbonney 0:bc6b8c94c371 364 fixup = branch_start + 2;
andrewbonney 0:bc6b8c94c371 365 r->code[fixup] = 0xff;
andrewbonney 0:bc6b8c94c371 366 break;
andrewbonney 0:bc6b8c94c371 367 default:
andrewbonney 0:bc6b8c94c371 368 (*re)--;
andrewbonney 0:bc6b8c94c371 369 last_op = r->code_size;
andrewbonney 0:bc6b8c94c371 370 exact(r, re);
andrewbonney 0:bc6b8c94c371 371 break;
andrewbonney 0:bc6b8c94c371 372 }
andrewbonney 0:bc6b8c94c371 373 }
andrewbonney 0:bc6b8c94c371 374
andrewbonney 0:bc6b8c94c371 375 int
andrewbonney 0:bc6b8c94c371 376 slre_compile(struct slre *r, const char *re)
andrewbonney 0:bc6b8c94c371 377 {
andrewbonney 0:bc6b8c94c371 378 r->err_str = NULL;
andrewbonney 0:bc6b8c94c371 379 r->code_size = r->data_size = r->num_caps = r->anchored = 0;
andrewbonney 0:bc6b8c94c371 380
andrewbonney 0:bc6b8c94c371 381 if (*re == '^')
andrewbonney 0:bc6b8c94c371 382 r->anchored++;
andrewbonney 0:bc6b8c94c371 383
andrewbonney 0:bc6b8c94c371 384 emit(r, OPEN); /* This will capture what matches full RE */
andrewbonney 0:bc6b8c94c371 385 emit(r, 0);
andrewbonney 0:bc6b8c94c371 386
andrewbonney 0:bc6b8c94c371 387 while (*re != '\0')
andrewbonney 0:bc6b8c94c371 388 compile(r, &re);
andrewbonney 0:bc6b8c94c371 389
andrewbonney 0:bc6b8c94c371 390 if (r->code[2] == BRANCH)
andrewbonney 0:bc6b8c94c371 391 fixup_branch(r, 4);
andrewbonney 0:bc6b8c94c371 392
andrewbonney 0:bc6b8c94c371 393 emit(r, CLOSE);
andrewbonney 0:bc6b8c94c371 394 emit(r, 0);
andrewbonney 0:bc6b8c94c371 395 emit(r, END);
andrewbonney 0:bc6b8c94c371 396
andrewbonney 0:bc6b8c94c371 397 return (r->err_str == NULL ? 1 : 0);
andrewbonney 0:bc6b8c94c371 398 }
andrewbonney 0:bc6b8c94c371 399
andrewbonney 0:bc6b8c94c371 400 static int match(const struct slre *, int,
andrewbonney 0:bc6b8c94c371 401 const char *, int, int *, struct cap *);
andrewbonney 0:bc6b8c94c371 402
andrewbonney 0:bc6b8c94c371 403 static void
andrewbonney 0:bc6b8c94c371 404 loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
andrewbonney 0:bc6b8c94c371 405 {
andrewbonney 0:bc6b8c94c371 406 int saved_offset, matched_offset;
andrewbonney 0:bc6b8c94c371 407
andrewbonney 0:bc6b8c94c371 408 saved_offset = matched_offset = *ofs;
andrewbonney 0:bc6b8c94c371 409
andrewbonney 0:bc6b8c94c371 410 while (match(r, pc + 2, s, len, ofs, NULL)) {
andrewbonney 0:bc6b8c94c371 411 saved_offset = *ofs;
andrewbonney 0:bc6b8c94c371 412 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
andrewbonney 0:bc6b8c94c371 413 matched_offset = saved_offset;
andrewbonney 0:bc6b8c94c371 414 *ofs = saved_offset;
andrewbonney 0:bc6b8c94c371 415 }
andrewbonney 0:bc6b8c94c371 416
andrewbonney 0:bc6b8c94c371 417 *ofs = matched_offset;
andrewbonney 0:bc6b8c94c371 418 }
andrewbonney 0:bc6b8c94c371 419
andrewbonney 0:bc6b8c94c371 420 static void
andrewbonney 0:bc6b8c94c371 421 loop_non_greedy(const struct slre *r, int pc, const char *s,int len, int *ofs)
andrewbonney 0:bc6b8c94c371 422 {
andrewbonney 0:bc6b8c94c371 423 int saved_offset = *ofs;
andrewbonney 0:bc6b8c94c371 424
andrewbonney 0:bc6b8c94c371 425 while (match(r, pc + 2, s, len, ofs, NULL)) {
andrewbonney 0:bc6b8c94c371 426 saved_offset = *ofs;
andrewbonney 0:bc6b8c94c371 427 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
andrewbonney 0:bc6b8c94c371 428 break;
andrewbonney 0:bc6b8c94c371 429 }
andrewbonney 0:bc6b8c94c371 430
andrewbonney 0:bc6b8c94c371 431 *ofs = saved_offset;
andrewbonney 0:bc6b8c94c371 432 }
andrewbonney 0:bc6b8c94c371 433
andrewbonney 0:bc6b8c94c371 434 static int
andrewbonney 0:bc6b8c94c371 435 is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
andrewbonney 0:bc6b8c94c371 436 {
andrewbonney 0:bc6b8c94c371 437 int i, ch;
andrewbonney 0:bc6b8c94c371 438
andrewbonney 0:bc6b8c94c371 439 ch = s[*ofs];
andrewbonney 0:bc6b8c94c371 440
andrewbonney 0:bc6b8c94c371 441 for (i = 0; i < len; i++)
andrewbonney 0:bc6b8c94c371 442 if (p[i] == ch) {
andrewbonney 0:bc6b8c94c371 443 (*ofs)++;
andrewbonney 0:bc6b8c94c371 444 return (1);
andrewbonney 0:bc6b8c94c371 445 }
andrewbonney 0:bc6b8c94c371 446
andrewbonney 0:bc6b8c94c371 447 return (0);
andrewbonney 0:bc6b8c94c371 448 }
andrewbonney 0:bc6b8c94c371 449
andrewbonney 0:bc6b8c94c371 450 static int
andrewbonney 0:bc6b8c94c371 451 is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
andrewbonney 0:bc6b8c94c371 452 {
andrewbonney 0:bc6b8c94c371 453 int i, ch;
andrewbonney 0:bc6b8c94c371 454
andrewbonney 0:bc6b8c94c371 455 ch = s[*ofs];
andrewbonney 0:bc6b8c94c371 456
andrewbonney 0:bc6b8c94c371 457 for (i = 0; i < len; i++)
andrewbonney 0:bc6b8c94c371 458 if (p[i] == ch)
andrewbonney 0:bc6b8c94c371 459 return (0);
andrewbonney 0:bc6b8c94c371 460
andrewbonney 0:bc6b8c94c371 461 (*ofs)++;
andrewbonney 0:bc6b8c94c371 462 return (1);
andrewbonney 0:bc6b8c94c371 463 }
andrewbonney 0:bc6b8c94c371 464
andrewbonney 0:bc6b8c94c371 465 static int
andrewbonney 0:bc6b8c94c371 466 match(const struct slre *r, int pc, const char *s, int len,
andrewbonney 0:bc6b8c94c371 467 int *ofs, struct cap *caps)
andrewbonney 0:bc6b8c94c371 468 {
andrewbonney 0:bc6b8c94c371 469 int n, saved_offset, res = 1;
andrewbonney 0:bc6b8c94c371 470
andrewbonney 0:bc6b8c94c371 471 while (res && r->code[pc] != END) {
andrewbonney 0:bc6b8c94c371 472
andrewbonney 0:bc6b8c94c371 473 assert(pc < r->code_size);
andrewbonney 0:bc6b8c94c371 474 assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
andrewbonney 0:bc6b8c94c371 475
andrewbonney 0:bc6b8c94c371 476 switch (r->code[pc]) {
andrewbonney 0:bc6b8c94c371 477 case BRANCH:
andrewbonney 0:bc6b8c94c371 478 saved_offset = *ofs;
andrewbonney 0:bc6b8c94c371 479 res = match(r, pc + 3, s, len, ofs, caps);
andrewbonney 0:bc6b8c94c371 480 if (res == 0) {
andrewbonney 0:bc6b8c94c371 481 *ofs = saved_offset;
andrewbonney 0:bc6b8c94c371 482 res = match(r, pc + r->code[pc + 1],
andrewbonney 0:bc6b8c94c371 483 s, len, ofs, caps);
andrewbonney 0:bc6b8c94c371 484 }
andrewbonney 0:bc6b8c94c371 485 pc += r->code[pc + 2];
andrewbonney 0:bc6b8c94c371 486 break;
andrewbonney 0:bc6b8c94c371 487 case EXACT:
andrewbonney 0:bc6b8c94c371 488 res = 0;
andrewbonney 0:bc6b8c94c371 489 n = r->code[pc + 2]; /* String length */
andrewbonney 0:bc6b8c94c371 490 if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
andrewbonney 0:bc6b8c94c371 491 r->code[pc + 1], n)) {
andrewbonney 0:bc6b8c94c371 492 (*ofs) += n;
andrewbonney 0:bc6b8c94c371 493 res = 1;
andrewbonney 0:bc6b8c94c371 494 }
andrewbonney 0:bc6b8c94c371 495 pc += 3;
andrewbonney 0:bc6b8c94c371 496 break;
andrewbonney 0:bc6b8c94c371 497 case QUEST:
andrewbonney 0:bc6b8c94c371 498 res = 1;
andrewbonney 0:bc6b8c94c371 499 saved_offset = *ofs;
andrewbonney 0:bc6b8c94c371 500 if (!match(r, pc + 2, s, len, ofs, caps))
andrewbonney 0:bc6b8c94c371 501 *ofs = saved_offset;
andrewbonney 0:bc6b8c94c371 502 pc += r->code[pc + 1];
andrewbonney 0:bc6b8c94c371 503 break;
andrewbonney 0:bc6b8c94c371 504 case STAR:
andrewbonney 0:bc6b8c94c371 505 res = 1;
andrewbonney 0:bc6b8c94c371 506 loop_greedy(r, pc, s, len, ofs);
andrewbonney 0:bc6b8c94c371 507 pc += r->code[pc + 1];
andrewbonney 0:bc6b8c94c371 508 break;
andrewbonney 0:bc6b8c94c371 509 case STARQ:
andrewbonney 0:bc6b8c94c371 510 res = 1;
andrewbonney 0:bc6b8c94c371 511 loop_non_greedy(r, pc, s, len, ofs);
andrewbonney 0:bc6b8c94c371 512 pc += r->code[pc + 1];
andrewbonney 0:bc6b8c94c371 513 break;
andrewbonney 0:bc6b8c94c371 514 case PLUS:
andrewbonney 0:bc6b8c94c371 515 if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0)
andrewbonney 0:bc6b8c94c371 516 break;
andrewbonney 0:bc6b8c94c371 517
andrewbonney 0:bc6b8c94c371 518 loop_greedy(r, pc, s, len, ofs);
andrewbonney 0:bc6b8c94c371 519 pc += r->code[pc + 1];
andrewbonney 0:bc6b8c94c371 520 break;
andrewbonney 0:bc6b8c94c371 521 case PLUSQ:
andrewbonney 0:bc6b8c94c371 522 if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0)
andrewbonney 0:bc6b8c94c371 523 break;
andrewbonney 0:bc6b8c94c371 524
andrewbonney 0:bc6b8c94c371 525 loop_non_greedy(r, pc, s, len, ofs);
andrewbonney 0:bc6b8c94c371 526 pc += r->code[pc + 1];
andrewbonney 0:bc6b8c94c371 527 break;
andrewbonney 0:bc6b8c94c371 528 case SPACE:
andrewbonney 0:bc6b8c94c371 529 res = 0;
andrewbonney 0:bc6b8c94c371 530 if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
andrewbonney 0:bc6b8c94c371 531 (*ofs)++;
andrewbonney 0:bc6b8c94c371 532 res = 1;
andrewbonney 0:bc6b8c94c371 533 }
andrewbonney 0:bc6b8c94c371 534 pc++;
andrewbonney 0:bc6b8c94c371 535 break;
andrewbonney 0:bc6b8c94c371 536 case NONSPACE:
andrewbonney 0:bc6b8c94c371 537 res = 0;
andrewbonney 0:bc6b8c94c371 538 if (*ofs <len && !isspace(((unsigned char *)s)[*ofs])) {
andrewbonney 0:bc6b8c94c371 539 (*ofs)++;
andrewbonney 0:bc6b8c94c371 540 res = 1;
andrewbonney 0:bc6b8c94c371 541 }
andrewbonney 0:bc6b8c94c371 542 pc++;
andrewbonney 0:bc6b8c94c371 543 break;
andrewbonney 0:bc6b8c94c371 544 case DIGIT:
andrewbonney 0:bc6b8c94c371 545 res = 0;
andrewbonney 0:bc6b8c94c371 546 if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
andrewbonney 0:bc6b8c94c371 547 (*ofs)++;
andrewbonney 0:bc6b8c94c371 548 res = 1;
andrewbonney 0:bc6b8c94c371 549 }
andrewbonney 0:bc6b8c94c371 550 pc++;
andrewbonney 0:bc6b8c94c371 551 break;
andrewbonney 0:bc6b8c94c371 552 case ANY:
andrewbonney 0:bc6b8c94c371 553 res = 0;
andrewbonney 0:bc6b8c94c371 554 if (*ofs < len) {
andrewbonney 0:bc6b8c94c371 555 (*ofs)++;
andrewbonney 0:bc6b8c94c371 556 res = 1;
andrewbonney 0:bc6b8c94c371 557 }
andrewbonney 0:bc6b8c94c371 558 pc++;
andrewbonney 0:bc6b8c94c371 559 break;
andrewbonney 0:bc6b8c94c371 560 case ANYOF:
andrewbonney 0:bc6b8c94c371 561 res = 0;
andrewbonney 0:bc6b8c94c371 562 if (*ofs < len)
andrewbonney 0:bc6b8c94c371 563 res = is_any_of(r->data + r->code[pc + 1],
andrewbonney 0:bc6b8c94c371 564 r->code[pc + 2], s, ofs);
andrewbonney 0:bc6b8c94c371 565 pc += 3;
andrewbonney 0:bc6b8c94c371 566 break;
andrewbonney 0:bc6b8c94c371 567 case ANYBUT:
andrewbonney 0:bc6b8c94c371 568 res = 0;
andrewbonney 0:bc6b8c94c371 569 if (*ofs < len)
andrewbonney 0:bc6b8c94c371 570 res = is_any_but(r->data + r->code[pc + 1],
andrewbonney 0:bc6b8c94c371 571 r->code[pc + 2], s, ofs);
andrewbonney 0:bc6b8c94c371 572 pc += 3;
andrewbonney 0:bc6b8c94c371 573 break;
andrewbonney 0:bc6b8c94c371 574 case BOL:
andrewbonney 0:bc6b8c94c371 575 res = *ofs == 0 ? 1 : 0;
andrewbonney 0:bc6b8c94c371 576 pc++;
andrewbonney 0:bc6b8c94c371 577 break;
andrewbonney 0:bc6b8c94c371 578 case EOL:
andrewbonney 0:bc6b8c94c371 579 res = *ofs == len ? 1 : 0;
andrewbonney 0:bc6b8c94c371 580 pc++;
andrewbonney 0:bc6b8c94c371 581 break;
andrewbonney 0:bc6b8c94c371 582 case OPEN:
andrewbonney 0:bc6b8c94c371 583 if (caps != NULL)
andrewbonney 0:bc6b8c94c371 584 caps[r->code[pc + 1]].ptr = s + *ofs;
andrewbonney 0:bc6b8c94c371 585 pc += 2;
andrewbonney 0:bc6b8c94c371 586 break;
andrewbonney 0:bc6b8c94c371 587 case CLOSE:
andrewbonney 0:bc6b8c94c371 588 if (caps != NULL)
andrewbonney 0:bc6b8c94c371 589 caps[r->code[pc + 1]].len = (s + *ofs) -
andrewbonney 0:bc6b8c94c371 590 caps[r->code[pc + 1]].ptr;
andrewbonney 0:bc6b8c94c371 591 pc += 2;
andrewbonney 0:bc6b8c94c371 592 break;
andrewbonney 0:bc6b8c94c371 593 case END:
andrewbonney 0:bc6b8c94c371 594 pc++;
andrewbonney 0:bc6b8c94c371 595 break;
andrewbonney 0:bc6b8c94c371 596 default:
andrewbonney 0:bc6b8c94c371 597 printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
andrewbonney 0:bc6b8c94c371 598 assert(0);
andrewbonney 0:bc6b8c94c371 599 break;
andrewbonney 0:bc6b8c94c371 600 }
andrewbonney 0:bc6b8c94c371 601 }
andrewbonney 0:bc6b8c94c371 602
andrewbonney 0:bc6b8c94c371 603 return (res);
andrewbonney 0:bc6b8c94c371 604 }
andrewbonney 0:bc6b8c94c371 605
andrewbonney 0:bc6b8c94c371 606 int
andrewbonney 0:bc6b8c94c371 607 slre_match(const struct slre *r, const char *buf, int len,
andrewbonney 0:bc6b8c94c371 608 struct cap *caps)
andrewbonney 0:bc6b8c94c371 609 {
andrewbonney 0:bc6b8c94c371 610 int i, ofs = 0, res = 0;
andrewbonney 0:bc6b8c94c371 611
andrewbonney 0:bc6b8c94c371 612 if (r->anchored) {
andrewbonney 0:bc6b8c94c371 613 res = match(r, 0, buf, len, &ofs, caps);
andrewbonney 0:bc6b8c94c371 614 } else {
andrewbonney 0:bc6b8c94c371 615 for (i = 0; i < len && res == 0; i++) {
andrewbonney 0:bc6b8c94c371 616 ofs = i;
andrewbonney 0:bc6b8c94c371 617 res = match(r, 0, buf, len, &ofs, caps);
andrewbonney 0:bc6b8c94c371 618 }
andrewbonney 0:bc6b8c94c371 619 }
andrewbonney 0:bc6b8c94c371 620
andrewbonney 0:bc6b8c94c371 621 return (res);
andrewbonney 0:bc6b8c94c371 622 }
andrewbonney 0:bc6b8c94c371 623