[Asterisk-Dev] better pattern matcher
Reini Urban
asterisk-dev@lists.digium.com
Mon, 30 Jun 2003 15:47:00 +0200
This is a multi-part message in MIME format.
--------------080402040903010907010407
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
attached is my latest matcher.
fixed a bug in the quantifiers and added support for alternations.
it's not yet solid-proof, but should work for most patterns.
see the tests at the end.
Reini Urban wrote:
> old matcher:
> * a regex must start with "_"
> * regex patterns are case-insensitive except characters inside []
> * "." matches zero or more characters (as in * in glob)
> * character ranges as in [0-9a-zA-Z]
> * X,Z,N match 0-9,1-9,2-9 resp.
>
> plus the new one with additional features:
> * "?" matches any character
> * negation as in [^0] ("any char but 0")
> or [^a-z]
> * explicit quantifiers as in X{2,4} ("from 2 to 4 digits"),
> or X{2,} ("at least 2 digits"),
> or X{2} ("exactly 2 digits"),
> * regex-style quantifiers like ?, + and * are supported
> by "{}" grouping.
> ? <=> {0,1}
> + <=> {1,}
> * <=> {0,}
> * grouping as in N(1X){1,2} ("one or two sequences of 1X")
> * capturing
> with () grouped matches are stored in subsequent numbered global
> variables, starting with $1, $2 and so on.
> * alternation as in (01|0|99) ("01 or 0 or 99")
--
Reini Urban - Entwicklung - http://inode.at
--------------080402040903010907010407
Content-Type: text/x-c-code;
name="testmatch.c"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="testmatch.c"
/*
* Asterisk -- A telephony toolkit for Linux.
*
* PBX regex matcher test.
*
* Written by Reini Urban <rurban@inode.at>
* Given to the public domain.
*/
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
/* this is just for standalone testing */
#include <stdarg.h>
#define LOG_WARNING 3
#define LOG_DEBUG 0
extern void ast_log(int level, const char *fmt, ...) {
va_list ap;
va_start(ap, fmt);
vfprintf(stdout, fmt, ap);
va_end(ap);
fflush(stdout);
}
/* end of standalone test functions */
int patmatch_groupcounter = 0;
char patmatch_group[80] = "";
/* derived from code by Steffen Offermann 1991 (public domain)
http://www.cs.umu.se/~isak/Snippets/xstrcmp.c
*/
int ast_extension_patmatch(const char *pattern, char *data)
{
int i,border=0;
char *where;
static char prev = '\0';
static char groupdata[80] = "";
static char *group = patmatch_group;
int groupcounter = patmatch_groupcounter;
ast_log(LOG_DEBUG, " >>> \"%s\" =~ /%s/\n", data, pattern);
switch (toupper(*pattern))
{
case '\0':
ast_log(LOG_DEBUG, " !>>> \"%s\" => %s\n", data, !*data ? "OK" : "FAIL");
return !*data;
case ' ':
case '-':
/* Ignore these characters in the pattern */
return *data && ast_extension_patmatch(pattern+1, data);
case '.' : /* wildcard as '*' in glob(). Match any sequence of characters. 0 or more */
prev = *pattern;
if (! *(pattern+1) )
return 1; /* return *data; => match one or more */
else
return ast_extension_patmatch(pattern+1, data) || (*data && ast_extension_patmatch(pattern, data+1));
/* wildcard character: Match any char */
case '?' :
prev = *pattern;
return *data && ast_extension_patmatch(pattern+1, data+1);
case 'X': /* 0-9 */
prev = *pattern;
return ((*data >= '0') && (*data <= '9')) && ast_extension_patmatch(pattern+1, data+1);
case 'Z': /* 1-9 */
prev = *pattern;
return ((*data >= '1') && (*data <= '9')) && ast_extension_patmatch(pattern+1, data+1);
case 'N': /* 2-9 */
prev = *pattern;
return ((*data >= '2') && (*data <= '9')) && ast_extension_patmatch(pattern+1, data+1);
case '{': /* quantifier {n[,m]} */
{
char *comma;
int cpos;
where=strchr(pattern,'}');
if (where) {
border=(int)(where-pattern);
comma = strchr(pattern,',');
}
if (!where || border > strlen(pattern)) {
ast_log(LOG_WARNING, "Wrong %s pattern usage\n", pattern);
return 0;
} else {
char tmp[8];
int from, to;
if (comma)
cpos = (int)(comma-pattern);
else
cpos = border;
strncpy(tmp,pattern+1,cpos-1);
tmp[cpos-1] = '\0';
from = atoi(tmp);
if (comma) {
if (border-cpos > 1) { /* {f,t} */
strncpy(tmp,comma+1,border-cpos);
tmp[border-cpos+1] = '\0';
to = atoi(tmp);
} else { /* {f,} */
to = strlen(data); /* may fail if after the group are more pattern chars */
if (*(pattern+border+1)) {
to = to - strlen(pattern+border+1) + 1;
}
}
} else { /* {f} */
if (from == 0) {
ast_log(LOG_WARNING, "Invalid {0} pattern quantifier %s\n", pattern);
return 0;
}
to = from;
}
if (from < 0 || to <= 0 || to < from) {
ast_log(LOG_WARNING, "Invalid pattern quantifier %s\n", pattern);
return 0;
}
if (*group) { /* check for repeated pattern{n,m} in previous group */
int i;
for (i=0; i< strlen(group); i++) {
data--;
}
ast_log(LOG_DEBUG, ">>> check for repeated pattern{%d,%d} of group '%s' in data '%s'\n", from, to, group, data);
strcat(group,".");
} else {
ast_log(LOG_DEBUG, ">>> check for repeated pattern{%d,%d} in previous character '%c'\n", from, to, prev);
data--;
group[0] = prev;
group[1] = '.';
group[2] = '\0';
}
*tmp = prev;
for (i=to; i>=from; i--) {
if (ast_extension_patmatch_repeated(group,data,i)) break;
}
prev = *tmp;
if (i >= from || !from) { /* if found */
ast_log(LOG_DEBUG, " >>>> found '%s' in data '%s' after %d runs\n", group, data, i);
char name[16];
data = data + (i * (strlen(group)- 1)) - 1;
int l = strlen(groupdata) - strlen(data);
/* data = data-i+from-1; */ /* possible failure here! */
if (prev == ')') { /* grouping => capture */
*(group+strlen(group)-1) = '\0';
groupdata[l+1] = '\0';
ast_log(LOG_DEBUG, " >>>>> end of group '%s', data: %s\n", group, groupdata);
/* capture the found data in variables $1, $2, ... */
sprintf(name,"%d",++groupcounter);
#ifdef AST_PBX_MAX_STACK
pbx_builtin_setvar_helper(NULL,name,groupdata);
#endif
ast_log(LOG_DEBUG, " >>>>> global variable $%s set to '%s'\n", name, groupdata);
}
}
*group = '\0';
prev = '\0';
if (i >= from) { /* found: continue */
ast_log(LOG_DEBUG, " >>>> found in round %d from %d\n", i, to);
if (*data) {
if (*(pattern+border+1)) /* if the tail check fails, try the other rounds */
if (ast_extension_patmatch(pattern+border+1, data+1))
return 1;
else return (ast_extension_patmatch_repeated(group, data, i--) &&
ast_extension_patmatch(pattern+border+1, data+i));
else
return ast_extension_patmatch(pattern+border+1, data+1);
}
else
return 1;
} else if (from == 0) { /* not found, but special case from=0: no match needed */
ast_log(LOG_DEBUG, " >>>> not found, but no match needed and data exhausted\n");
if (*data)
return ast_extension_patmatch(pattern+border+1, data+1);
else
return 1;
} else /* not found */
return 0;
}
}
/* unreachable code */
case '(': /* grouping */
prev = *pattern;
if (*group) {
ast_log(LOG_WARNING, "Unexpected subgroup ( in pattern %s\n", pattern);
return 0;
}
where=strchr(pattern,')');
if (where)
border=(int)(where-pattern);
if (!where || border > strlen(pattern)) {
ast_log(LOG_WARNING, "Wrong (%s) pattern usage\n", pattern);
return 0;
}
strncpy(group,pattern+1,border-1);
group[border-1] = '\0';
strcpy(groupdata,data);
ast_log(LOG_DEBUG, ">>> group '%s' stored, data: '%s'\n", group, groupdata);
if (strchr(pattern,'|')) { /* alternations */
char *s, *scopy, *sep, *sepcopy;
s = scopy = (char *) malloc(strlen(pattern));
sepcopy = (char *) malloc(strlen(pattern));
strcpy(s,group);
while (sep = strsep(&s,"|")) {
strcpy(sepcopy,sep);
strcat(sepcopy,pattern+border+1);
ast_log(LOG_DEBUG, " >>>> alternative '%s' =~ /%s/\n", sepcopy, data);
if (ast_extension_patmatch(sepcopy, data)) break;
if (!*data) {
sep = NULL; break;
}
}
free(scopy);
if (sep) { /* found */
free(sepcopy);
return 1;
} else {
free(sepcopy);
return 0;
}
} else {
return ast_extension_patmatch(pattern+1, data);
}
case ')': /* group end */
prev = *pattern;
if (!*group) {
ast_log(LOG_WARNING, "Unexpected ) in pattern %s\n", pattern);
return 0;
} else {
if (pattern[1] != '{') { /* capture without quantifiers */
char name[16];
int l = strlen(groupdata) - strlen(data);
groupdata[l-1] = '\0';
*(group+strlen(group)-1) = '\0';
ast_log(LOG_DEBUG, ">>> end of group '%s', data: %s\n", group, groupdata);
/* capture the found data in variables $1, $2, ... */
sprintf(name,"%d",++groupcounter);
#ifdef AST_PBX_MAX_STACK
pbx_builtin_setvar_helper(NULL,name,groupdata);
#endif
ast_log(LOG_DEBUG, ">>> global variable $%s set to '%s'\n", name, groupdata);
*group = '\0';
}
}
return ast_extension_patmatch(pattern+1, data);
case '|': /* alternation */
if (!*group) {
ast_log(LOG_WARNING, "Need group for | in %s\n", pattern);
return 0;
}
#if 0
{
char *s, *sep;
s = strdup(group);
while (sep = strsep(&s,"|")) {
ast_log(LOG_DEBUG, " >>>> alternative %d\n", sep);
if (ast_extension_patmatch(sep, data)) break;
if (!*data) {
*sep = '\0'; break;
}
}
if (*sep) { /* found */
free(s);
return ast_extension_patmatch(pattern+1, data+1);
} else {
free(s);
return 0;
}
}
#endif
case '[': /* Character ranges: [0-9a-zA-Z] */
prev = *pattern;
pattern++;
where=strchr(pattern,']');
if (where)
border=(int)(where-pattern);
if (!where || border > strlen(pattern)) {
ast_log(LOG_WARNING, "Wrong [%s] pattern usage\n", pattern);
return 0;
}
if (*pattern == '^') { /* Negation like [^...] */
for (i=1; i<border; i++) {
if (*data==pattern[i])
return 0;
else if ((pattern[i+1]=='-') && (i+2<border)) {
if (*data >= pattern[i] && *data <= pattern[i+2]) {
return 0;
} else {
i+=2;
continue;
}
}
}
return ast_extension_patmatch(where+1, data+1);
} else {
for (i=0; i<border; i++) {
if (i+2<border) {
if (*data==pattern[i])
return ast_extension_patmatch(where+1, data+1);
else if (pattern[i+1]=='-') {
if (*data >= pattern[i] && *data <= pattern[i+2]) {
return ast_extension_patmatch(where+1, data+1);
} else {
i+=2;
continue;
}
}
}
}
}
break;
default :
prev = *pattern;
return (toupper(*pattern) == toupper(*data)) && ast_extension_patmatch(pattern+1, data+1);
}
return 0;
}
/* try exactly num repetitions, from high to from */
int ast_extension_patmatch_repeated(const char *pattern, char *data, const int num)
{
int i;
ast_log(LOG_DEBUG, " >>> try %d repetitions of '%s' in data '%s'\n", num, pattern, data);
if (num <= 0) return 0;
for (i=1; i<=num; i++) {
ast_log(LOG_DEBUG, " >>>> round %d with data %s\n", i, data);
if (!ast_extension_patmatch(pattern, data)) return 0;
data = data + strlen(pattern) - 1;
}
return 1;
}
int ast_extension_match(char *pattern, char *data)
{
int match;
patmatch_groupcounter = 0;
*patmatch_group = '\0';
if (!*pattern) {
ast_log(LOG_WARNING, "ast_extension_match: empty pattern\n");
return 0;
}
if (!*data) {
ast_log(LOG_WARNING, "ast_extension_match: empty data\n");
return 0;
}
if (pattern[0] != '_') {
match = (strcmp(pattern, data) == 0);
ast_log(LOG_DEBUG, "ast_extension_match %s == /%s/ => %d\n", data, pattern, match);
} else {
match = ast_extension_patmatch(pattern+1,data);
ast_log(LOG_DEBUG, "ast_extension_match %s =~ /%s/ => %d\n", data, pattern+1, match);
}
return match;
}
int testmatch(char *pattern, char *data, int should)
{
int match;
match = ast_extension_match(pattern, data);
if (should == match) {
printf("OK\n");
} else {
printf("FAIL\n");
exit(-1);
}
return match;
}
int main (int argc, char *argv[]) {
char data1[] = "0316890002";
char data2[] = "03168900028500";
/* plain strcmp */
testmatch("0316890002",data1,1);
/* simple regex with ending . */
testmatch("_0N.",data1,1);
/* not terminating . */
testmatch("_0N.0",data1,0);
#if 1
testmatch("_0N. 8500",data1,0);
testmatch("_0N. 8500",data2,1);
testmatch("_0[2-9]. 8500",data2,1);
/* ranges */
testmatch("_[a-z]o[0-9a-z].","voicemail",1);
testmatch("_[0]o[0-9a-z].","voicemail",0);
/* negation */
testmatch("_[^0-9]o.","voicemail",1);
testmatch("_[^x]o.","voicemail",1);
testmatch("_[^v]o.","voicemail",0);
testmatch("_[^a-z]o.","voicemail",0);
#endif
/* quantifiers */
testmatch("_0316890{2}N","0316890002",0);
testmatch("_0316890{3}N","0316890002",1);
testmatch("_0316890{1,}N","0316890002",1);
testmatch("_0316890{1,3}N","0316890002",1);
testmatch("_0316890{4,5}N","0316890002",0);
/* grouping and capturing */
testmatch("_031689(0XX)N","0316890002",1);
testmatch("_031689(0X9)N","0316890002",0);
testmatch("_031689(X){1,3}N","0316890002",1);
testmatch("_031689(X){4,3}N","0316890002",0);
testmatch("_031689(X){3}N","0316890002",1);
testmatch("_031689(X){4}","0316890002",1);
testmatch("_031689(X){5}","0316890002",0);
testmatch("_031689(0){3}N","0316890002",1);
testmatch("_031689(X){4}N","0316890002",0);
testmatch("_031689(X){4}N","03168900021",0);
testmatch("_031689(X){4}Z","03168900021",1);
testmatch("_031689(XX){2}Z","03168900021",1);
/* alternation */
testmatch("_(032|02)N.","0316890002",0);
#if 1
testmatch("_(031N|0)N.","0316890002",1); /* not yet supported */
#endif
}
--------------080402040903010907010407--