#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
from tocoli.spell import lookup
from tocoli import PY2, string_types
# Character classes
ANY = r'.'  # regex token: any character
WORD = r'\w'  # regex token: word character
NOT_WORD = r'\W'  # regex token: anything that is not a word character
DIGIT = r'\d'  # regex token: digit
NOT_DIGIT = r'\D'  # regex token: anything that is not a digit
WHITESPACE = r'\s'  # regex token: whitespace character
NOT_WHITESPACE = r'\S'  # regex token: anything that is not whitespace
class CharClass:
    """Namespace grouping the character-class tokens.

    Every attribute re-exports the module-level constant of the same name,
    so ``CharClass.DIGIT`` and ``DIGIT`` are the same string.
    """

    # Matches anything / word characters and their negation.
    ANY = ANY
    WORD = WORD
    NOT_WORD = NOT_WORD

    # Digits and their negation.
    DIGIT = DIGIT
    NOT_DIGIT = NOT_DIGIT

    # Whitespace and its negation.
    WHITESPACE = WHITESPACE
    NOT_WHITESPACE = NOT_WHITESPACE
def set(chars, expr=None):
    """Build a character set (``[...]``) from *chars*.

    Ranges such as ``a-z`` (endpoints may be backslash-escaped) and escape
    sequences such as ``\\d`` are kept intact. Loose characters that are
    already covered by one of the ranges are dropped, and the remaining
    tokens and characters are de-duplicated and sorted.

    Args:
        chars: the set content; converted to text first.
        expr: optional expression the set is appended to.

    Returns:
        Re: ``[...]``, prefixed with *expr* when given.
    """
    if PY2:
        chars = unicode(chars)
    else:
        chars = str(chars)

    # A token is either a range (two capture groups hold its endpoints,
    # without their optional backslash) or a two-character escape sequence.
    token_pattern = re.compile(r'\\?(.)-\\?(.)|\\.')
    tokens = []
    bounds = []
    for found in token_pattern.finditer(chars):
        tokens.append(found.group(0))
        # Only real ranges have endpoints; an escape like ``\d`` has none.
        # (Indexing token[2] unconditionally used to raise IndexError.)
        if found.group(1) is not None:
            bounds.append((ord(found.group(1)), ord(found.group(2))))

    # What is left after removing the tokens are the loose characters.
    for token in tokens:
        chars = chars.replace(token, '')

    # NOTE: the builtin ``set`` is shadowed by this function, hence the
    # set comprehensions instead of ``set(...)`` calls.
    loose = {
        c for c in chars
        if not any(low <= ord(c) <= high for low, high in bounds)
    }
    unique_tokens = {token for token in tokens}
    s = u'[{}]'.format(u''.join(sorted(unique_tokens) + sorted(loose)))

    if expr is None:
        return Re(s)
    # Format instead of ``expr + s`` so that *expr* may be a plain string
    # or an Re (whose ``+`` expects an Re on the right-hand side).
    return Re(u'{}{}'.format(expr, s))
def negated_set(chars, expr=None):
    """Build a negated character set (``[^...]``) from *chars*.

    Ranges such as ``a-z`` (endpoints may be backslash-escaped) and escape
    sequences such as ``\\d`` are kept intact. Loose characters that are
    already covered by one of the ranges are dropped, and the remaining
    tokens and characters are de-duplicated and sorted.

    Args:
        chars: the set content; converted to text first.
        expr: optional expression the set is appended to.

    Returns:
        Re: ``[^...]``, prefixed with *expr* when given.
    """
    if PY2:
        chars = unicode(chars)
    else:
        chars = str(chars)

    # A token is either a range (two capture groups hold its endpoints,
    # without their optional backslash) or a two-character escape sequence.
    token_pattern = re.compile(r'\\?(.)-\\?(.)|\\.')
    tokens = []
    bounds = []
    for found in token_pattern.finditer(chars):
        tokens.append(found.group(0))
        # Only real ranges have endpoints; an escape like ``\d`` has none.
        # (Indexing token[2] unconditionally used to raise IndexError.)
        if found.group(1) is not None:
            bounds.append((ord(found.group(1)), ord(found.group(2))))

    # What is left after removing the tokens are the loose characters.
    for token in tokens:
        chars = chars.replace(token, '')

    # NOTE: the builtin ``set`` is shadowed in this module, hence the set
    # comprehensions instead of ``set(...)`` calls.
    loose = {
        c for c in chars
        if not any(low <= ord(c) <= high for low, high in bounds)
    }
    unique_tokens = {token for token in tokens}
    s = u'[^{}]'.format(u''.join(sorted(unique_tokens) + sorted(loose)))

    if expr is None:
        return Re(s)
    # Format instead of ``expr + s`` so that *expr* may be a plain string
    # or an Re (whose ``+`` expects an Re on the right-hand side).
    return Re(u'{}{}'.format(expr, s))
def range(start, end):
    """Return the text of a character range for use inside a set.

    Integer endpoints are converted to strings. When both endpoints are
    already all-lowercase or all-uppercase the range is ``start-end``;
    for mixed case both the lowercase and the uppercase range are emitted.

    Args:
        start: first character of the range (string or int).
        end: last character of the range (string or int).

    Returns:
        A plain string such as ``a-z`` or ``a-zA-Z``.
    """
    if isinstance(start, int):
        start = str(start)
    if isinstance(end, int):
        end = str(end)

    given = (start, end)
    lowered = (start.lower(), end.lower())
    uppered = (start.upper(), end.upper())

    # Consistent casing (this includes digits): a single range suffices.
    if given == lowered or given == uppered:
        return u'{}-{}'.format(start, end)

    # Mixed casing: cover both the lowercase and the uppercase range.
    return u'{}-{}{}-{}'.format(lowered[0], lowered[1], uppered[0], uppered[1])
# Anchors
BEGINNING = r'^'  # regex anchor token: beginning
END = r'$'  # regex anchor token: end
WORD_BOUNDARY = r'\b'  # regex anchor token: word boundary
NOT_WORD_BOUNDARY = r'\B'  # regex anchor token: not a word boundary
class Anchors:
    """Namespace grouping the anchor tokens.

    Every attribute re-exports the module-level constant of the same name.
    """

    # Start / end anchors.
    BEGINNING = BEGINNING
    END = END

    # Word-boundary anchors.
    WORD_BOUNDARY = WORD_BOUNDARY
    NOT_WORD_BOUNDARY = NOT_WORD_BOUNDARY
# Escaped characters
def escape_octal(triplet):
    """Return an octal escape (backslash plus three octal digits).

    Args:
        triplet: up to three octal digits as a string, or a number that is
            converted with ``oct()``.

    Returns:
        Re: the backslash-prefixed, zero-padded triplet.

    Raises:
        ValueError: if there are more than three digits or a character is
            not an octal digit.
    """
    from string import octdigits

    if not isinstance(triplet, string_types):
        literal = oct(triplet)
        # Strip the prefix oct() adds: one character on Python 2 ('0'),
        # two on Python 3 ('0o'). A one-character result has no prefix.
        if len(literal) != 1:
            literal = literal[1:] if PY2 else literal[2:]
        triplet = literal

    width = len(triplet)
    if width > 3:
        raise ValueError(u'octal has more than three digits: {}'.format(triplet))
    if width < 3:
        # Left-pad with zeros up to three digits.
        triplet = u'{}{}'.format(u'0' * (3 - width), triplet)

    for digit in triplet:
        if digit not in octdigits:
            raise ValueError(u'invalid literal for octal: \'{}\''.format(triplet))

    return Re(u'\\{}'.format(triplet))
def escape_hexadecimal(pair):
    """Return a hexadecimal escape (``\\x`` plus two uppercase hex digits).

    Args:
        pair: up to two hex digits as a string, or a number that is
            converted with ``hex()``.

    Returns:
        Re: ``\\xHH`` with the digits zero-padded and uppercased.

    Raises:
        ValueError: if there are more than two digits or a character is
            not a hex digit.
    """
    from string import hexdigits

    if not isinstance(pair, string_types):
        literal = hex(pair)
        # Drop the two-character prefix added by hex().
        pair = literal if len(literal) == 1 else literal[2:]

    width = len(pair)
    if width > 2:
        raise ValueError(u'hex has more than two digits: {}'.format(pair))
    if width < 2:
        # Left-pad with zeros up to two digits.
        pair = u'{}{}'.format(u'0' * (2 - width), pair)

    for digit in pair:
        if digit not in hexdigits:
            raise ValueError(u'invalid literal for hex: \'{}\''.format(pair))

    return Re(u'\\x{}'.format(pair.upper()))
def escape_unicode(quadruple):
    """Return a unicode escape (``\\u`` plus four uppercase hex digits).

    Args:
        quadruple: up to four hex digits as a string, or a number that is
            converted with ``hex()``.

    Returns:
        Re: ``\\uHHHH`` with the digits zero-padded and uppercased.

    Raises:
        ValueError: if there are more than four digits or a character is
            not a hex digit.
    """
    from string import hexdigits

    if not isinstance(quadruple, string_types):
        literal = hex(quadruple)
        # Drop the two-character prefix added by hex().
        quadruple = literal if len(literal) == 1 else literal[2:]

    width = len(quadruple)
    if width > 4:
        raise ValueError(u'hex has more than four digits: {}'.format(quadruple))
    if width < 4:
        # Left-pad with zeros up to four digits.
        quadruple = u'{}{}'.format(u'0' * (4 - width), quadruple)

    for digit in quadruple:
        if digit not in hexdigits:
            raise ValueError(u'invalid literal for hex: \'{}\''.format(quadruple))

    return Re(u'\\u{}'.format(quadruple.upper()))
def escape_control_char(char):
    """Return a control-character escape (``\\c`` plus a letter A-Z).

    Args:
        char: a single letter (case-insensitive), or a code point that is
            converted with ``chr()``.

    Returns:
        Re: ``\\cX`` with the letter uppercased. An Re (a text subclass) is
        returned, like the other ``escape_*`` helpers, so the result can be
        combined with other Re objects.

    Raises:
        ValueError: if *char* is not exactly one character, or lies outside
            ``'A'``..``'Z'`` after uppercasing.
    """
    if not isinstance(char, string_types):
        char = chr(char)
    char = char.upper()

    # Reject '' and multi-character input up front: the range checks below
    # compare whole strings (so 'AB' used to slip through) and the error
    # messages call ord(), which only accepts a single character.
    if len(char) != 1:
        raise ValueError(
            u'control character must be a single character: \'{}\''.
            format(char))
    if char < 'A':
        raise ValueError(
            u'control character should be bigger than \'A\'({}): \'{}\'({})'.
            format(ord('A'), char, ord(char)))
    if char > 'Z':
        raise ValueError(
            u'control character should be smaller than \'Z\'({}): {}({})'.
            format(ord('Z'), char, ord(char)))
    return Re(u'\\c{}'.format(char))
def escape(expr):
    """Return *expr* as an Re with ``re.escape`` applied to it."""
    escaped = re.escape(expr)
    return Re(escaped)
class Escaped:
    """Namespace of backslash escape tokens (stored as raw strings)."""

    # Whitespace-like escapes.
    TAB = r'\t'
    LINE_FEED = r'\n'
    VERTICAL_TAB = r'\v'
    FORM_FEED = r'\f'
    CARRIAGE_RETURN = r'\r'

    # Null escape.
    NULL = r'\0'
# Groups & Lookaround
def group_capturing(expr):
    """Wrap *expr* in a capturing group: ``(expr)``."""
    grouped = u'({})'.format(expr)
    return Re(grouped)
def group_non_capturing(expr):
    """Wrap *expr* in a non-capturing group: ``(?:expr)``."""
    grouped = u'(?:{})'.format(expr)
    return Re(grouped)
def group(expr, capturing=True):
    """Wrap *expr* in a group.

    Args:
        expr: the expression to group.
        capturing: truthy for a capturing group, falsy for a non-capturing
            one.

    Returns:
        The result of ``group_capturing`` or ``group_non_capturing``.
    """
    wrap = group_capturing if capturing else group_non_capturing
    return wrap(expr)
def backreference(n):
    """Return a numeric backreference (``\\n``) to capture group *n*.

    Args:
        n: the group number, a positive integer.

    Returns:
        Re: a backslash followed by *n*.

    Raises:
        TypeError: if *n* is not an int.
        ValueError: if *n* is smaller than 1.
    """
    if not isinstance(n, int):
        raise TypeError(u'n must be <type int>: {}'.format(type(n)))
    # Group numbers start at 1. Zero used to pass the check even though the
    # message demands a positive number, and '\0' is the NULL escape (see
    # Escaped.NULL), not a reference to a group.
    if n < 1:
        raise ValueError(u'n must be positive: {}'.format(n))
    return Re(u'\\{}'.format(n))
def lookahead_positive(expr, lookup):
    """Append a positive lookahead to *expr*: ``expr(?=lookup)``."""
    pattern = u'{}(?={})'.format(expr, lookup)
    return Re(pattern)
def lookahead_negative(expr, lookup):
    """Append a negative lookahead to *expr*: ``expr(?!lookup)``."""
    pattern = u'{}(?!{})'.format(expr, lookup)
    return Re(pattern)
def lookahead(expr, lookup, positive=True):
    """Append a lookahead to *expr*.

    Args:
        expr: the expression the lookahead is attached to.
        lookup: the expression to look ahead for.
        positive: truthy for ``(?=...)``, falsy for ``(?!...)``.

    Returns:
        The result of ``lookahead_positive`` or ``lookahead_negative``.
    """
    build = lookahead_positive if positive else lookahead_negative
    return build(expr, lookup)
def lookbehind_positive(expr, lookup):
    """Append a positive lookbehind to *expr*: ``expr(?<=lookup)``."""
    pattern = u'{}(?<={})'.format(expr, lookup)
    return Re(pattern)
def lookbehind_negative(expr, lookup):
    """Append a negative lookbehind to *expr*: ``expr(?<!lookup)``."""
    pattern = u'{}(?<!{})'.format(expr, lookup)
    return Re(pattern)
def lookbehind(expr, loookup, positive=True):
    """Append a lookbehind to *expr*.

    Args:
        expr: the expression the lookbehind is attached to.
        loookup: the expression to look behind for (the spelling is part of
            the public signature and therefore kept).
        positive: truthy for ``(?<=...)``, falsy for ``(?<!...)``.

    Returns:
        The result of ``lookbehind_positive`` or ``lookbehind_negative``.
    """
    build = lookbehind_positive if positive else lookbehind_negative
    return build(expr, loookup)
# Quantifiers & Alternation
PLUS = r'+'  # quantifier token: one or more
STAR = r'*'  # quantifier token: zero or more
OPTIONAL = r'?'  # quantifier token: zero or one
class Quantifier:
    """Namespace grouping the quantifier tokens.

    Every attribute re-exports the module-level constant of the same name.
    """

    # '+', '*' and '?' respectively.
    PLUS = PLUS
    STAR = STAR
    OPTIONAL = OPTIONAL
def plus(expr):
    """Quantify *expr* with ``+`` and return it as a QuantifiedRe."""
    quantified = u'{}+'.format(expr)
    return QuantifiedRe(quantified)
def star(expr):
    """Quantify *expr* with ``*`` and return it as a QuantifiedRe."""
    quantified = u'{}*'.format(expr)
    return QuantifiedRe(quantified)
def optional(expr):
    """Quantify *expr* with ``?`` and return it as a QuantifiedRe."""
    quantified = u'{}?'.format(expr)
    return QuantifiedRe(quantified)
def quantify(expr, min, max=None):
    """Quantify *expr* with an explicit repetition range: ``expr{min,max}``.

    Args:
        expr: the expression to quantify.
        min: minimum number of repetitions; anything ``int()`` accepts.
        max: maximum number of repetitions; ``None``, ``False`` or ``''``
            leave the range open (``expr{min,}``).

    Returns:
        QuantifiedRe: the quantified expression.

    Raises:
        ValueError: if a bound cannot be converted to an integer, is
            negative, or if *max* is smaller than *min*.
        TypeError: if a bound has a type ``int()`` does not accept.
    """
    # Use the converted value. Previously the result of int() was thrown
    # away, so inputs like 2.5, ' 2 ' or True were validated but then
    # written verbatim into the pattern.
    min = int(min)
    if min < 0:
        raise ValueError(u'min must not be negative: {}'.format(min))

    if max is None or max is False or max == '':
        max = ''  # open-ended range
    else:
        max = int(max)
        if max < min:
            raise ValueError(
                u'max must not be smaller than min: {} < {}'.format(max, min))

    # Doubled braces are literal braces in str.format().
    return QuantifiedRe(u'{}{{{},{}}}'.format(expr, min, max))
def quantify_lazy(expr, min, max=None):
    """Quantify *expr* with ``{min,max}`` and make the quantifier lazy.

    The arguments are passed to ``quantify`` unchanged; a ``?`` is appended
    to its result.

    Returns:
        Re: ``expr{min,max}?``.
    """
    greedy = quantify(expr, min, max)
    return Re(u'{}?'.format(greedy))
def lazy(quantified_expr):
    """Make a quantified expression lazy by appending ``?``.

    NOTE: the argument is not type-checked; any value is formatted and
    suffixed, whether or not it is a QuantifiedRe.

    Returns:
        Re: *quantified_expr* followed by ``?``.
    """
    lazy_pattern = u'{}?'.format(quantified_expr)
    return Re(lazy_pattern)
ALTERNATION = r'|'  # alternation operator token
def alternate(left, right):
    """Join *left* and *right* with the alternation operator.

    Returns:
        A plain string of the form ``left|right`` (not an Re).
    """
    return u'{lhs}|{rhs}'.format(lhs=left, rhs=right)
# Substitution
class Substitution:
    """Namespace of replacement-string tokens."""

    MATCH = r'$&'
    BEFORE_MATCH = r'$`'
    AFTER_MATCH = r"$'"
    DOLLAR = r'$$'

    # Declared static: the function takes no ``self``, so without the
    # decorator calling it on an instance (or at all on Python 2) fails.
    @staticmethod
    def capture_group(n):
        """Return the token referring to capture group *n*: ``$n``."""
        return r'${}'.format(n)
# Flags
class Flags:
    """Namespace of single-letter flag codes."""

    IGNORE_CASE = 'i'
    GLOBAL_SEARCH = 'g'
    MULTILINE = 'm'
class Re(unicode if PY2 else str):
    """Text subclass representing a regular-expression fragment.

    The fragment is available both as the string value itself and as the
    ``expr`` attribute. The methods delegate to the module-level builder
    functions of the same name and return new objects; nothing is mutated.
    """

    def __new__(cls, expr=None):
        # Mirror __init__ so the string value and ``expr`` always agree:
        # without this, ``Re(None)`` had the string value 'None' while its
        # ``expr`` attribute was ''.
        if expr is None:
            expr = u''
        return super(Re, cls).__new__(cls, expr)

    def __init__(self, expr=None):
        if expr is None:
            self.expr = u''
        elif PY2:
            self.expr = unicode(expr)
        else:
            self.expr = str(expr)

    @staticmethod
    def _expr_of(value):
        """Return ``value.expr`` for Re operands, else *value* itself."""
        return getattr(value, 'expr', value)

    def __eq__(self, other):
        return repr(self) == repr(other)

    def __hash__(self):
        return hash(self.expr)

    def __add__(self, other):
        # Accept plain strings as well as Re operands; reading
        # ``other.expr`` unconditionally raised AttributeError for strings.
        return Re(u'{}{}'.format(self.expr, self._expr_of(other)))

    def __and__(self, other):
        # ``&`` concatenates exactly like ``+``.
        return Re(u'{}{}'.format(self.expr, self._expr_of(other)))

    def __or__(self, other):
        return Re(u'{}|{}'.format(self, other))

    def __coerce__(self, other):
        if not isinstance(self, Re):
            self = Re(repr(self))
        # The original tested ``self`` a second time here, so ``other`` was
        # never converted.
        if not isinstance(other, Re):
            other = Re(other)
        return (self, other)

    def add_set(self, chars):
        """Append a character set built from *chars*."""
        return self + set(chars)

    def add_negated_set(self, chars):
        """Append a negated character set built from *chars*."""
        return self + negated_set(chars)

    def set(self, chars=None):
        """Turn this expression, plus optional extra *chars*, into a set."""
        return set(self.expr + (chars if chars is not None else ''))

    def negated_set(self):
        """Turn this expression into a negated set."""
        return negated_set(self.expr)

    def add_escaped_octal(self, triplet):
        """Append the octal escape for *triplet*."""
        return self + escape_octal(triplet)

    def add_escaped_hexadecimal(self, pair):
        """Append the hexadecimal escape for *pair*."""
        return self + escape_hexadecimal(pair)

    def add_escaped_unicode(self, quadruple):
        """Append the unicode escape for *quadruple*."""
        return self + escape_unicode(quadruple)

    def add_escaped_control_char(self, char):
        """Append the control-character escape for *char*."""
        return self + escape_control_char(char)

    def add(self, expr, escape=False):
        """Append *expr*, passing it through ``re.escape`` when *escape*."""
        if escape:
            # The ``escape`` parameter shadows the module-level escape()
            # function, so calling ``escape(expr)`` here invoked the bool.
            # Do what that function does directly instead.
            return self + Re(re.escape(expr))
        return self + Re(expr)

    def escape(self):
        """Return this expression with ``re.escape`` applied."""
        return escape(self.expr)

    def group(self, capturing=True):
        """Wrap this expression in a (non-)capturing group."""
        return group(self.expr, capturing)

    def add_backreference(self, n):
        """Append a backreference to group *n*."""
        return self + backreference(n)

    def lookahead(self, lookup, positive=True):
        """Append a positive or negative lookahead for *lookup*."""
        return lookahead(self.expr, lookup, positive)

    def lookbehind(self, lookup, positive=True):
        """Append a positive or negative lookbehind for *lookup*."""
        return lookbehind(self.expr, lookup, positive)

    def plus(self):
        """Quantify this expression with ``+``."""
        return plus(self.expr)

    def star(self):
        """Quantify this expression with ``*``."""
        return star(self.expr)

    def optional(self):
        """Quantify this expression with ``?``."""
        return optional(self.expr)

    def quantify(self, min, max=None):
        """Quantify this expression with ``{min,max}``."""
        return quantify(self.expr, min, max)
class QuantifiedRe(Re):
    """An Re that ends in a quantifier and can therefore be made lazy."""

    def lazy(self):
        """Return the lazy variant of this quantified expression."""
        quantified = self.expr
        return lazy(quantified)
class Generator():
    """Placeholder for an object-style pattern generator.

    Only the constructor arguments are stored; ``generate`` is not
    implemented yet.
    """

    def __init__(self, options, dictionary):
        self.options = options
        # Keep the dictionary as well; the argument used to be accepted
        # but silently discarded.
        self.dictionary = dictionary

    def generate(self, input):
        """Not implemented yet; always returns ``None``."""
        pass
def _has_alternation(item):
    """Return True when any element of *item* contains the ``|`` operator."""
    return any('|' in part for part in item)


def _group_or_escape(item, grouped):
    """Wrap *item* in a group when *grouped*, else ``re.escape`` it."""
    if grouped:
        return '(' + item + ')'
    return re.escape(item)


def generate(str,
             start=False, end=False,
             match=None, quantifier='{0,}', setLike=False,
             dictionary=None):
    """Generates a python string for regular-expressions.

    The input is split into its first element, the middle elements and the
    last element. Each element is escaped, or wrapped in a group when it
    contains ``|``, and optionally followed by a lazy filler built from
    *match* and *quantifier*.

    Args:
        str: the input text, or a sequence of strings.
        start: anchor the result with ``^`` and keep the first element
            literal; with a dictionary the first element is also excluded
            from the lookup.
        end: anchor the result with ``$``; the last element is then kept
            literal and gets no filler.
        match: expression allowed between the elements; ``None`` disables
            the filler.
        quantifier: quantifier applied to *match*; a ``?`` is appended to
            make it lazy.
        setLike: when true, every non-anchored position matches any of the
            non-anchored elements (in any order) instead of its own one.
        dictionary: when given, the input is first passed through
            ``lookup(str, dictionary)``.

    Returns:
        The generated pattern as a string; ``''`` for empty input.
    """
    if str == '':
        return ''

    if dictionary is not None:
        # Truthiness instead of ``is True`` / ``is False``: a truthy
        # non-bool *start* (e.g. 1) used to skip the lookup entirely while
        # still being treated as "start" further down.
        if start:
            str = [str[0]] + lookup(str[1:], dictionary)
        else:
            str = lookup(str, dictionary)

    count = len(str)
    if count == 0:
        # Empty non-string sequence: nothing to match (used to raise
        # IndexError on str[0]).
        return ''

    first = str[0]
    middle = str[1:-1] if count >= 3 else None
    last = str[-1] if count >= 2 else None

    # An element containing '|' is wrapped in a group instead of being
    # escaped. For the middle part a single such element switches all
    # middle elements to grouping.
    first_grouped = _has_alternation(first)
    middle_grouped = middle is not None and _has_alternation(middle)
    last_grouped = last is not None and _has_alternation(last)

    # Lazy filler allowed after an element, e.g. '.{0,}?'.
    wide = '' if match is None else match + quantifier + '?'

    if setLike:
        # Elements that may appear at any non-anchored position.
        candidates = str
        if start:
            candidates = candidates[1:]
        if end and last is not None:
            candidates = candidates[:-1]
        alternatives = '|'.join(candidates)
        if '|' in alternatives:
            alternatives = '(' + alternatives + ')'
        any_of = alternatives + wide

        if start:
            head = _group_or_escape(first, first_grouped) + wide
        else:
            head = any_of

        body = any_of * len(middle) if middle is not None else None

        if last is None:
            tail = None
        elif end:
            tail = _group_or_escape(last, last_grouped)
        else:
            tail = any_of
    else:
        head = _group_or_escape(first, first_grouped) + wide

        if middle is None:
            body = None
        else:
            body = ''.join(
                _group_or_escape(item, middle_grouped) + wide
                for item in middle)

        if last is None:
            tail = None
        elif end:
            # An anchored last element is not followed by the filler.
            tail = _group_or_escape(last, last_grouped)
        else:
            tail = _group_or_escape(last, last_grouped) + wide

    pieces = [head]
    if body is not None:
        pieces.append(body)
    if tail is not None:
        pieces.append(tail)
    result = ''.join(pieces)

    if start:
        result = '^' + result
    if end:
        result = result + '$'
    return result