Source

Murky / Source / RegEx.m

//
//  RegEx.m
//  Murky
//
//  Copyright 2008-2009 Jens Alfke. All rights reserved.
//

#import <Foundation/Foundation.h>

//#define U_HIDE_DRAFT_API 1
#define U_DISABLE_RENAMING 1
#include "unicode/umachine.h"
#include "unicode/uregex.h"

#import "RegEx.h"


/** Gets the Unicode (UTF-16) characters of a string as efficiently as possible.
    Declares a local `_mallocedBuffer`, so it can be used at most once per scope, and
    assigns to the caller-supplied lvalues CHARS (const unichar*) and LEN (integer).
    STR is evaluated more than once, so pass a side-effect-free expression.

    If the string is nil or empty, CHARS is NULL and LEN is 0. Otherwise CHARS is either
    the pointer returned by CFStringGetCharactersPtr, or a copy of the characters: on the
    stack (alloca) for fewer than 256 characters, else in a malloc'ed block. If that
    malloc fails, CHARS is NULL and LEN is reset to 0.

    You must call EndUsingStringChars(STR) after you're done, or memory may be leaked. */
#define UsingStringChars(STR,CHARS,LEN) \
    unichar *_mallocedBuffer = NULL; \
    LEN = (STR) ?CFStringGetLength((CFStringRef)(STR)) :0; \
    if( LEN == 0 ) { \
        CHARS = NULL; \
    } else { \
        CHARS = CFStringGetCharactersPtr((CFStringRef)(STR)); \
        if( !CHARS ) { \
            /* No direct pointer available: copy into a writable scratch buffer. */ \
            unichar *_copyBuffer; \
            if( LEN < 256 ) \
                _copyBuffer = alloca(LEN*sizeof(unichar)); \
            else \
                _copyBuffer = _mallocedBuffer = malloc(LEN*sizeof(unichar)); \
            if( _copyBuffer ) \
                CFStringGetCharacters((CFStringRef)(STR),(CFRange){0,LEN},_copyBuffer); \
            else \
                LEN = 0; \
            CHARS = _copyBuffer; \
        } \
    }

/** Companion to UsingStringChars: frees the heap buffer, if that macro allocated one.
    Must appear in the same scope, because it reads the `_mallocedBuffer` local declared
    there. The STR argument is not used. */
#define EndUsingStringChars(STR) \
    if( _mallocedBuffer != NULL ) { free(_mallocedBuffer); }


/** Gets the Unicode characters of a string, either as a direct pointer into the string,
    or by copying them into a malloc'ed buffer.
    @param str       The string to read; may be NULL.
    @param length    On return, the number of UTF-16 units available at the result
                     (0 if the string is NULL/empty or the copy could not be allocated).
    @param malloced  On return, YES if the result is a malloc'ed copy that the caller
                     must free(); NO if it points into the string or is NULL.
    @return The characters, or NULL when *length is 0. */
static const unichar* GetStringChars( CFStringRef str, unsigned *length, BOOL *malloced )
{
    *malloced = NO;
    *length = str ?CFStringGetLength(str) :0;
    if( *length == 0 )
        return NULL;

    const unichar *direct = CFStringGetCharactersPtr(str);
    if( direct )
        return direct;

    /*Log(@"getCharsPtr failed on \"%@\":",str); CFShowStr(str);
    Log(@"fastestEncoding = %i", CFStringGetFastestEncoding(str));
    Log(@"getCStringPtr would have returned %p", CFStringGetCStringPtr(str, kCFStringEncodingUTF8));*/
    unichar *copy = malloc(*length*sizeof(unichar));
    if( !copy ) {
        // Out of memory: report "no characters" rather than writing through NULL.
        *length = 0;
        return NULL;
    }
    CFStringGetCharacters(str,(CFRange){0,*length},copy);
    *malloced = YES;
    return copy;
}


@implementation RegEx


/** Compiles a regular expression from an NSString pattern.
    @param pattern  The pattern source; asserted to be 1..1023 characters long.
    @param options  Flags passed straight through to uregex_open.
    @return The initialized object. Compilation failure trips an assertion; if
            assertions are compiled out, nil is returned instead. */
- (id) initWithPattern: (NSString*)pattern 
               options: (RegExOptions)options
{
    self = [super init];
    if( self ) {
        // Validate before borrowing the characters, so a throwing assertion cannot
        // leak the buffer that UsingStringChars may malloc.
        NSAssert([pattern length]>0 && [pattern length]<1024,@"bad pattern string");
        
        const unichar *chars;
        unsigned length;
        UsingStringChars(pattern, chars,length);
        
        UParseError errPos = {0};
        UErrorCode errCode = U_ZERO_ERROR;
        _regexp = uregex_open(chars, length, options,
                              &errPos,&errCode);
        
        EndUsingStringChars(pattern);
        
        NSAssert3(_regexp, @"RegEx parse error %i at position %i of \"%@\"",
                  errCode,errPos.offset,pattern);
        if( !_regexp ) {
            // Only reachable with assertions disabled: fail the init.
            [self release];
            return nil;
        }
    }
    return self;
}

/** Compiles a regular expression from a C-string pattern via uregex_openC.
    @param pattern  The pattern source as a C string.
    @param options  Flags passed straight through to uregex_openC.
    @return The initialized object. Compilation failure trips an assertion; if
            assertions are compiled out, nil is returned instead. */
- (id) initWithCPattern: (const char*)pattern 
                options: (RegExOptions)options
{
    self = [super init];
    if( self ) {
        UParseError errPos = {0};
        UErrorCode errCode = U_ZERO_ERROR;
        _regexp = uregex_openC(pattern, options,
                               &errPos,&errCode);
        NSAssert3(_regexp, @"RegEx parse error %i at position %i of \"%s\"",
                  errCode,errPos.offset,pattern);
        if( !_regexp ) {
            // Only reachable with assertions disabled: fail the init.
            [self release];
            return nil;
        }
    }
    return self;
}

/** Wraps an already-compiled URegularExpression. The receiver takes ownership:
    -dealloc / -finalize will call uregex_close on it. */
- (id) _initWithURegularExpression: (URegularExpression*)regexp
{
    self = [super init];
    if( self )
        _regexp = regexp;
    return self;
}


/** Convenience constructor: an autoreleased instance built with
    -initWithPattern:options:. */
+ (RegEx*) regExWithPattern: (NSString*)pattern 
                    options: (RegExOptions)options
{
    return [[[self alloc] initWithPattern: pattern options: options] autorelease];
}


/** Closes the ICU regular expression owned by this object. */
- (void) dealloc
{
    if( _regexp ) {
        uregex_close(_regexp);
        _regexp = NULL;     // never leave a closed handle behind
    }
    [super dealloc];
}

/** Garbage-collection counterpart of -dealloc. */
- (void) finalize
{
    if( _regexp ) {
        uregex_close(_regexp);
        _regexp = NULL;
    }
    [super finalize];
}


/** NSCopying: returns a new instance (of the receiver's class) that owns a clone of
    the compiled expression, or nil if ICU could not clone it. */
- (id) copyWithZone: (NSZone*)zone
{
    // ICU functions do nothing when the status already holds a failure code, so it
    // must start out as U_ZERO_ERROR.
    UErrorCode err = U_ZERO_ERROR;
    URegularExpression *clonedRegexp = uregex_clone(_regexp,&err);
    if( clonedRegexp && U_FAILURE(err) ) {
        uregex_close(clonedRegexp);
        clonedRegexp = NULL;
    }
    if( !clonedRegexp )
        return nil;
    return [[[self class] allocWithZone: zone] _initWithURegularExpression: clonedRegexp];
}


/** Returns an autoreleased RegExMatcher for this expression over the given string. */
- (RegExMatcher*) matcherForString: (NSString*)string
{
    return [[[RegExMatcher alloc] initWithRegEx: self searchString: string] autorelease];
}


/** Tests the string against the expression using uregex_matches from index 0.
    A nil or empty string never matches. The characters handed to ICU are borrowed
    only for the duration of this call.
    @return YES if ICU reports a match without error, else NO. */
- (BOOL) matchesString: (NSString*)str
{
    const unichar *chars;
    unsigned length;
    UsingStringChars(str,chars,length);
    
    BOOL matches = NO;
    if( chars ) {
        // Must be initialized: ICU treats a pre-existing failure status as "do nothing".
        UErrorCode err = U_ZERO_ERROR;
        uregex_setText(_regexp,chars,length,&err);
        if( U_SUCCESS(err) ) {
            UBool found = uregex_matches(_regexp,0,&err);
            matches = (found && U_SUCCESS(err));
        }
    }
    
    EndUsingStringChars(str);
    return matches;
}


@end




@implementation RegExMatcher


/** Creates a matcher for the given expression over the given search string.
    The matcher works on its own clone of the expression, so closing it on
    deallocation does not invalidate the RegEx it was created from.
    @param r    The compiled expression to match with.
    @param str  The string to search; see -setSearchString:.
    @return The matcher, or nil if the expression is missing or could not be cloned. */
- (id) initWithRegEx: (RegEx*)r searchString: (NSString*)str
{
    UErrorCode cloneErr = U_ZERO_ERROR;
    URegularExpression *ownRegexp = NULL;
    if( r && r->_regexp )
        ownRegexp = uregex_clone(r->_regexp,&cloneErr);
    if( ownRegexp && U_FAILURE(cloneErr) ) {
        uregex_close(ownRegexp);
        ownRegexp = NULL;
    }
    
    self = [super _initWithURegularExpression: ownRegexp];
    if( self ) {
        if( !ownRegexp ) {
            [self release];
            return nil;
        }
        [self setSearchString: str];
    }
    return self;
}


/** Closes the expression and releases the copied or retained search text. */
- (void) dealloc
{
    if( _regexp ) {
        uregex_close(_regexp);
        // RegEx's -dealloc (reached through super, presumably) also closes a non-NULL
        // _regexp; clearing it here prevents a second uregex_close on the same handle.
        _regexp = NULL;
    }
    if( _mallocedChars ) {
        free(_mallocedChars);
        _mallocedChars = NULL;
    }
    [_retainedStr release];
    [super dealloc];
}

/** Garbage-collection counterpart of -dealloc. */
- (void) finalize
{
    if( _regexp ) {
        uregex_close(_regexp);
        _regexp = NULL;     // see -dealloc: avoid a second close via super
    }
    if( _mallocedChars ) {
        free(_mallocedChars);
        _mallocedChars = NULL;
    }
    [super finalize];
}


/** The ICU status code left by the most recent operation on this matcher. */
- (int) lastError
{
    return _err;
}


/** Replaces the text being searched and resets the match position to 0.
    The previous text is released only after ICU has accepted the new one, so on
    failure the matcher keeps its previous search string.
    @return YES on success; NO if ICU rejected the text (see -lastError). A nil or
            empty string yields NULL characters, which uregex_setText is expected to
            reject. */
- (BOOL) setSearchString: (NSString*)str
{
    // Work from a copy: the characters may be a pointer into the string's own storage,
    // which must not change while ICU holds it.
    NSString *held = [str copy];
    
    unsigned length;
    BOOL malloced;
    const unichar *chars = GetStringChars((CFStringRef)held,&length,&malloced);
    
    // ICU calls are no-ops while the status holds an earlier failure, so clear it.
    _err = U_ZERO_ERROR;
    uregex_setText(_regexp,chars,length,&_err);
    if( U_FAILURE(_err) ) {
        if( malloced )
            free((unichar*)chars);
        [held release];
        return NO;
    }
    
    // The expression now points at the new text; the old backing store can go.
    if( _mallocedChars )
        free(_mallocedChars);
    [_retainedStr release];
    if( malloced ) {
        _mallocedChars = (unichar*) chars;
        _retainedStr = nil;
        [held release];
    } else {
        _mallocedChars = NULL;
        _retainedStr = held;    // keeps the borrowed character pointer alive
    }
    
    uregex_reset(_regexp,0,&_err);
    return U_SUCCESS(_err);
}


/** Returns the given range of the current search text as a new string, or nil for an
    empty range (which includes the {NSNotFound,0} "no match" range). */
- (NSString*) _rangeOfSearchString: (NSRange)r
{
    if( r.length == 0 )
        return nil;
    if( _retainedStr )
        return [_retainedStr substringWithRange: r];
    
    NSAssert(_mallocedChars!=nil,@"no _mallocedChars?!");
    if( !_mallocedChars )
        return nil;     // assertions compiled out and no text available
    return [NSString stringWithCharacters: _mallocedChars+r.location
                                   length: r.length];
}


/** Returns the result of uregex_matches from index 0. */
- (BOOL) matches
{
    _err = U_ZERO_ERROR;
    return uregex_matches(_regexp,0,&_err);
}

/** Returns the result of uregex_matches from startIndex. */
- (BOOL) matchesFrom: (unsigned)startIndex
{
    _err = U_ZERO_ERROR;
    return uregex_matches(_regexp,startIndex,&_err);
}


/** Resets the match position to the start of the search string. */
- (void) resetMatches
{
    _err = U_ZERO_ERROR;
    uregex_reset(_regexp,0,&_err);
}

/** Advances to the next match via uregex_findNext; returns its result. */
- (BOOL) findNextMatch
{
    _err = U_ZERO_ERROR;
    return uregex_findNext(_regexp,&_err);
}


/** The range of the whole current match (capture group 0), or {NSNotFound,0} if
    there is none or ICU reported an error. */
- (NSRange) rangeOfMatch
{
    return [self rangeOfMatchedGroup: 0];
}

/** The text of the whole current match, or nil if there is none or it is empty. */
- (NSString*) matchedString
{
    return [self _rangeOfSearchString: [self rangeOfMatch]];
}


/** The range of the given capture group in the current match.
    @return {NSNotFound,0} if ICU reports an error or a negative start index. */
- (NSRange) rangeOfMatchedGroup: (unsigned)group
{
    const NSRange noMatch = {NSNotFound, 0};
    
    _err = U_ZERO_ERROR;
    int32_t start = uregex_start(_regexp,group,&_err);
    if( U_FAILURE(_err) || start < 0 )
        return noMatch;
    
    int32_t end = uregex_end(_regexp,group,&_err);
    if( U_FAILURE(_err) || end < start )
        return noMatch;
    
    return NSMakeRange(start, end - start);
}

/** The text of the given capture group, or nil if it did not match or is empty. */
- (NSString*) matchedGroup: (unsigned)group 
{
    return [self _rangeOfSearchString: [self rangeOfMatchedGroup: group]];
}

@end






@implementation NSString (RegEx)

/** Convenience inverse of -[RegEx matchesString:]: asks the expression whether the
    receiver matches it. Messaging a nil expression yields NO. */
- (BOOL) matchesRegEx: (RegEx*)r
{
    BOOL matched = [r matchesString: self];
    return matched;
}

@end