Files
game-backuper/game_backuper/_pcre2.pyx
2021-09-12 21:12:46 +08:00

437 lines
15 KiB
Cython

from ._pcre2 cimport *
from ._Python cimport *
from enum import IntFlag
from libc.stdlib cimport malloc, free
from libc.string cimport strcpy
from cpython.mem cimport PyMem_Free
try:
from functools import cached_property
except ImportError:
cached_property = property
# Constants taken from the C header "_pcre2.h".
#
# The option bits are declared under a leading-underscore Cython name, with the
# real C identifier given as the quoted string, which leaves the plain
# PCRE2_* names free for the members of the Option / MatchOption classes.
cdef extern from "_pcre2.h":
    # Option bits (uint32_t); re-exported through Option and MatchOption.
    const uint32_t _PCRE2_ANCHORED "PCRE2_ANCHORED"
    const uint32_t _PCRE2_ALLOW_EMPTY_CLASS "PCRE2_ALLOW_EMPTY_CLASS"
    const uint32_t _PCRE2_ALT_BSUX "PCRE2_ALT_BSUX"
    const uint32_t _PCRE2_ALT_CIRCUMFLEX "PCRE2_ALT_CIRCUMFLEX"
    const uint32_t _PCRE2_ALT_VERBNAMES "PCRE2_ALT_VERBNAMES"
    const uint32_t _PCRE2_AUTO_CALLOUT "PCRE2_AUTO_CALLOUT"
    const uint32_t _PCRE2_CASELESS "PCRE2_CASELESS"
    const uint32_t _PCRE2_COPY_MATCHED_SUBJECT "PCRE2_COPY_MATCHED_SUBJECT"
    const uint32_t _PCRE2_DOLLAR_ENDONLY "PCRE2_DOLLAR_ENDONLY"
    const uint32_t _PCRE2_DOTALL "PCRE2_DOTALL"
    const uint32_t _PCRE2_DUPNAMES "PCRE2_DUPNAMES"
    const uint32_t _PCRE2_ENDANCHORED "PCRE2_ENDANCHORED"
    const uint32_t _PCRE2_EXTENDED "PCRE2_EXTENDED"
    const uint32_t _PCRE2_FIRSTLINE "PCRE2_FIRSTLINE"
    const uint32_t _PCRE2_LITERAL "PCRE2_LITERAL"
    const uint32_t _PCRE2_MATCH_INVALID_UTF "PCRE2_MATCH_INVALID_UTF"
    const uint32_t _PCRE2_MATCH_UNSET_BACKREF "PCRE2_MATCH_UNSET_BACKREF"
    const uint32_t _PCRE2_MULTILINE "PCRE2_MULTILINE"
    const uint32_t _PCRE2_NEVER_BACKSLASH_C "PCRE2_NEVER_BACKSLASH_C"
    const uint32_t _PCRE2_NEVER_UCP "PCRE2_NEVER_UCP"
    const uint32_t _PCRE2_NEVER_UTF "PCRE2_NEVER_UTF"
    const uint32_t _PCRE2_NOTBOL "PCRE2_NOTBOL"
    const uint32_t _PCRE2_NOTEOL "PCRE2_NOTEOL"
    const uint32_t _PCRE2_NOTEMPTY "PCRE2_NOTEMPTY"
    const uint32_t _PCRE2_NOTEMPTY_ATSTART "PCRE2_NOTEMPTY_ATSTART"
    const uint32_t _PCRE2_NO_AUTO_CAPTURE "PCRE2_NO_AUTO_CAPTURE"
    const uint32_t _PCRE2_NO_AUTO_POSSESS "PCRE2_NO_AUTO_POSSESS"
    const uint32_t _PCRE2_NO_DOTSTAR_ANCHOR "PCRE2_NO_DOTSTAR_ANCHOR"
    const uint32_t _PCRE2_NO_JIT "PCRE2_NO_JIT"
    const uint32_t _PCRE2_NO_START_OPTIMIZE "PCRE2_NO_START_OPTIMIZE"
    const uint32_t _PCRE2_NO_UTF_CHECK "PCRE2_NO_UTF_CHECK"
    const uint32_t _PCRE2_PARTIAL_HARD "PCRE2_PARTIAL_HARD"
    const uint32_t _PCRE2_PARTIAL_SOFT "PCRE2_PARTIAL_SOFT"
    const uint32_t _PCRE2_UCP "PCRE2_UCP"
    const uint32_t _PCRE2_UNGREEDY "PCRE2_UNGREEDY"
    const uint32_t _PCRE2_USE_OFFSET_LIMIT "PCRE2_USE_OFFSET_LIMIT"
    const uint32_t _PCRE2_UTF "PCRE2_UTF"
    # Length argument passed to pcre2_compile() / pcre2_match() in this file.
    const size_t PCRE2_ZERO_TERMINATED
    # Return codes the substring helpers are compared against.
    const int PCRE2_ERROR_NOMEMORY
    const int PCRE2_ERROR_UNSET
    # Selectors passed to pcre2_pattern_info() when reading the name table.
    const int PCRE2_INFO_NAMECOUNT
    const int PCRE2_INFO_NAMEENTRYSIZE
    const int PCRE2_INFO_NAMETABLE
class Option(IntFlag):
    """Flag bits accepted as the ``opt`` argument of ``PCRE2.__init__``.

    Each member carries the value of the identically named constant from
    "_pcre2.h"; members can be combined with ``|``.
    """
    PCRE2_ANCHORED = _PCRE2_ANCHORED
    PCRE2_ALLOW_EMPTY_CLASS = _PCRE2_ALLOW_EMPTY_CLASS
    PCRE2_ALT_BSUX = _PCRE2_ALT_BSUX
    PCRE2_ALT_CIRCUMFLEX = _PCRE2_ALT_CIRCUMFLEX
    PCRE2_ALT_VERBNAMES = _PCRE2_ALT_VERBNAMES
    PCRE2_AUTO_CALLOUT = _PCRE2_AUTO_CALLOUT
    PCRE2_CASELESS = _PCRE2_CASELESS
    PCRE2_DOLLAR_ENDONLY = _PCRE2_DOLLAR_ENDONLY
    PCRE2_DOTALL = _PCRE2_DOTALL
    PCRE2_DUPNAMES = _PCRE2_DUPNAMES
    PCRE2_ENDANCHORED = _PCRE2_ENDANCHORED
    PCRE2_EXTENDED = _PCRE2_EXTENDED
    PCRE2_FIRSTLINE = _PCRE2_FIRSTLINE
    PCRE2_LITERAL = _PCRE2_LITERAL
    PCRE2_MATCH_INVALID_UTF = _PCRE2_MATCH_INVALID_UTF
    PCRE2_MATCH_UNSET_BACKREF = _PCRE2_MATCH_UNSET_BACKREF
    PCRE2_MULTILINE = _PCRE2_MULTILINE
    PCRE2_NEVER_BACKSLASH_C = _PCRE2_NEVER_BACKSLASH_C
    PCRE2_NEVER_UCP = _PCRE2_NEVER_UCP
    PCRE2_NEVER_UTF = _PCRE2_NEVER_UTF
    PCRE2_NO_AUTO_CAPTURE = _PCRE2_NO_AUTO_CAPTURE
    PCRE2_NO_AUTO_POSSESS = _PCRE2_NO_AUTO_POSSESS
    PCRE2_NO_DOTSTAR_ANCHOR = _PCRE2_NO_DOTSTAR_ANCHOR
    PCRE2_NO_START_OPTIMIZE = _PCRE2_NO_START_OPTIMIZE
    PCRE2_NO_UTF_CHECK = _PCRE2_NO_UTF_CHECK
    PCRE2_UCP = _PCRE2_UCP
    PCRE2_UNGREEDY = _PCRE2_UNGREEDY
    PCRE2_USE_OFFSET_LIMIT = _PCRE2_USE_OFFSET_LIMIT
    PCRE2_UTF = _PCRE2_UTF
class MatchOption(IntFlag):
    """Flag bits accepted as the ``opt`` argument of ``PCRE2.match``.

    Each member carries the value of the identically named constant from
    "_pcre2.h"; members can be combined with ``|``.
    """
    PCRE2_ANCHORED = _PCRE2_ANCHORED
    PCRE2_COPY_MATCHED_SUBJECT = _PCRE2_COPY_MATCHED_SUBJECT
    PCRE2_ENDANCHORED = _PCRE2_ENDANCHORED
    PCRE2_NOTBOL = _PCRE2_NOTBOL
    PCRE2_NOTEOL = _PCRE2_NOTEOL
    PCRE2_NOTEMPTY = _PCRE2_NOTEMPTY
    PCRE2_NOTEMPTY_ATSTART = _PCRE2_NOTEMPTY_ATSTART
    PCRE2_NO_JIT = _PCRE2_NO_JIT
    PCRE2_NO_UTF_CHECK = _PCRE2_NO_UTF_CHECK
    PCRE2_PARTIAL_HARD = _PCRE2_PARTIAL_HARD
    PCRE2_PARTIAL_SOFT = _PCRE2_PARTIAL_SOFT
cdef class Match:
    """Result of a successful match, wrapping a ``pcre2_match_data`` block.

    Instances are created by ``PCRE2.match``, which attaches the native match
    data through ``set_data``.  The object owns that data and frees it when it
    is deallocated.  It also keeps a reference to the subject string, because
    the substring accessors read from the subject buffer that the match data
    points at.

    All offsets (``start``, ``end``, ``span``, ``regs``) are offsets into the
    buffer that was handed to ``pcre2_match``.  ``PCRE2.match`` passes the
    UTF-8 form of the subject, so they count bytes, not characters.
    """
    cdef pcre2_match_data* data
    cdef object inp
    cdef object r

    def __cinit__(self):
        self.data = NULL

    def __dealloc__(self):
        if self.data != NULL:
            pcre2_match_data_free(self.data)
            self.data = NULL

    def __init__(self, unicode inp, r):
        """Remember the subject string ``inp`` and the pattern object ``r``."""
        self.inp = inp
        self.r = r

    cdef uint32_t _group_count(self) except 0:
        """Return the number of offset pairs, raising if nothing is attached.

        Raises:
            ValueError: no match data is attached, or it holds no pairs.
        """
        cdef uint32_t count
        if self.data == NULL:
            raise ValueError(u'No matched data.')
        count = pcre2_get_ovector_count(self.data)
        if count == 0:
            raise ValueError(u'No match')
        return count

    cdef object _error_text(self, int code):
        """Return PCRE2's message for ``code``, or None if unavailable."""
        # A stack buffer avoids the malloc/free pair and cannot leak when
        # decoding raises.
        cdef PCRE2_UCHAR errbuf[1024]
        cdef int length = pcre2_get_error_message(code, errbuf, 1024)
        if length > 0:
            return (<char*>errbuf)[:length].decode('utf-8')
        return None

    cdef object _substring(self, uint32_t number):
        """Return group ``number`` as ``str``, or None if it did not match.

        Raises:
            MemoryError: PCRE2 could not allocate the substring.
            ValueError: any other PCRE2 error.
        """
        cdef PCRE2_UCHAR* buf = NULL
        cdef PCRE2_SIZE length = 0
        cdef int rc = pcre2_substring_get_bynumber(self.data, number,
                                                   &buf, &length)
        if rc == 0:
            # try/finally so the PCRE2-allocated copy is released even when
            # the bytes are not valid UTF-8 and decode() raises.
            try:
                return (<char*>buf)[:length].decode('utf-8')
            finally:
                pcre2_substring_free(buf)
        if rc == PCRE2_ERROR_UNSET:
            return None
        if rc == PCRE2_ERROR_NOMEMORY:
            raise MemoryError()
        raise ValueError(self._error_text(rc) or u'Can not get substring.')

    def __getitem__(self, uint32_t i):
        """Return the text of group ``i`` (0 is the whole match).

        Returns None for a group that did not participate in the match.

        Raises:
            ValueError: no match data is attached.
            IndexError: ``i`` is not smaller than the number of offset pairs.
        """
        cdef uint32_t count = self._group_count()
        if i >= count:
            raise IndexError(u'No such group')
        return self._substring(i)

    def end(self) -> int:
        """Return the end offset of the whole match."""
        self._group_count()
        return pcre2_get_ovector_pointer(self.data)[1]

    # NOTE: plain ``property`` is used for the read-only attributes below.
    # functools.cached_property stores its value in the instance ``__dict__``,
    # which this extension type does not have.
    @property
    def endpos(self) -> int:
        """Same value as ``end()``."""
        return self.end()

    def group(self) -> str:
        """Return the text of the whole match (group 0)."""
        self._group_count()
        return self._substring(0)

    def groupdict(self):
        """Return a dict mapping group names to their matched text.

        Names come from ``self.r.namedtable()``; any error it raises
        propagates.  A group that did not participate maps to None.  When
        several group numbers share a name, the lowest-numbered group that
        matched supplies the value.
        """
        cdef uint32_t count = self._group_count()
        cdef uint32_t i
        names = self.r.namedtable()
        result = {}
        for i in range(1, count):
            if i not in names:
                continue
            name = names[i]
            # Keep an earlier real value; otherwise record this group's
            # value, which may itself be None.
            if result.get(name) is None:
                result[name] = self._substring(i)
        return result

    def groups(self):
        """Return a tuple with the text of every group from 1 upwards.

        Groups that did not participate in the match are None.
        """
        cdef uint32_t count = self._group_count()
        cdef uint32_t i
        found = []
        for i in range(1, count):
            found.append(self._substring(i))
        return tuple(found)

    @property
    def lastgroup(self) -> str:
        """Name of the highest-numbered group that matched, or None.

        None is returned when no group besides group 0 matched, or when that
        group has no entry in ``self.r.namedtable()``.
        """
        # Signed index: the scan has to be able to stop below zero instead of
        # wrapping around.
        cdef Py_ssize_t i
        regs = self.regs
        i = len(regs) - 1
        while i > 0 and regs[i][0] == -1:
            i -= 1
        if i <= 0:
            return None
        names = self.r.namedtable()
        if i in names:
            return names[i]
        return None

    @property
    def lastindex(self) -> int:
        """Number of offset pairs in the match data, minus one.

        NOTE(review): this is the size of the offset vector, not the index of
        the last group that actually matched, so it can differ from what
        ``re.Match.lastindex`` would report - confirm this is intended.
        """
        return self._group_count() - 1

    @property
    def pos(self) -> int:
        """Same value as ``start()``."""
        return self.start()

    @property
    def re(self):
        """The pattern object this match was created with."""
        return self.r

    @property
    def regs(self):
        """Tuple of ``(start, end)`` offsets, one pair per group.

        Groups that did not participate are reported as ``(-1, -1)``.
        """
        cdef uint32_t count = self._group_count()
        cdef PCRE2_SIZE* vect = pcre2_get_ovector_pointer(self.data)
        cdef uint32_t i
        pairs = []
        for i in range(count):
            if vect[i * 2] == <PCRE2_SIZE>-1:
                pairs.append((-1, -1))
            else:
                pairs.append((vect[i * 2], vect[i * 2 + 1]))
        return tuple(pairs)

    def span(self) -> (int, int):
        """Return ``(start, end)`` offsets of the whole match."""
        cdef PCRE2_SIZE* vect
        self._group_count()
        vect = pcre2_get_ovector_pointer(self.data)
        return (vect[0], vect[1])

    def start(self) -> int:
        """Return the start offset of the whole match."""
        self._group_count()
        return pcre2_get_ovector_pointer(self.data)[0]

    @property
    def string(self) -> str:
        """The subject string this match was created with."""
        return self.inp

    cdef set_data(self, pcre2_match_data* data):
        """Take ownership of ``data``; it is freed with this object.

        Raises:
            ValueError: ``data`` is NULL.
        """
        if data == NULL:
            raise ValueError(u'data is NULL.')
        # Release a previously attached block instead of leaking it.
        if self.data != NULL and self.data != data:
            pcre2_match_data_free(self.data)
        self.data = data
cdef class PCRE2:
    """A compiled PCRE2 pattern.

    The object owns the native ``pcre2_code`` and frees it on deallocation.
    """
    cdef pcre2_code* code

    def __cinit__(self):
        self.code = NULL

    def __dealloc__(self):
        if self.code != NULL:
            pcre2_code_free(self.code)
            self.code = NULL

    cdef object _error_text(self, int code):
        """Return PCRE2's message for ``code``, or None if unavailable."""
        # A stack buffer avoids the malloc/free pair and cannot leak when
        # decoding raises.
        cdef PCRE2_UCHAR errbuf[1024]
        cdef int length = pcre2_get_error_message(code, errbuf, 1024)
        if length > 0:
            return (<char*>errbuf)[:length].decode('utf-8')
        return None

    def __init__(self, unicode inp, opt: Option = None):
        """Compile the pattern ``inp``.

        Args:
            inp: pattern text; it is handed to PCRE2 UTF-8 encoded.
            opt: compile options, an ``Option`` or a plain ``int``.  When
                given, it REPLACES the default ``PCRE2_UTF | PCRE2_ALT_BSUX``
                rather than being added to it.

        Raises:
            ValueError: ``inp`` is None or the pattern does not compile.
        """
        cdef uint32_t opts = _PCRE2_UTF | _PCRE2_ALT_BSUX
        cdef int err = 0
        cdef PCRE2_SIZE erroffset = 0
        cdef bytes encoded
        cdef char* raw
        cdef pcre2_code* compiled
        if inp is None:
            raise ValueError(u'Empty pattern.')
        if isinstance(opt, Option):
            opts = opt.value
        elif isinstance(opt, int):
            opts = opt
        # Encode explicitly and pass the real length, so a pattern containing
        # a NUL character is not silently cut short and an unencodable string
        # raises instead of yielding a NULL pointer.
        encoded = inp.encode('utf-8')
        raw = encoded
        compiled = pcre2_compile(<PCRE2_SPTR>raw, <PCRE2_SIZE>len(encoded),
                                 opts, &err, &erroffset, NULL)
        if compiled == NULL:
            detail = self._error_text(err)
            if detail is None:
                raise ValueError(u'Invalid pattern.')
            raise ValueError(u"Error at offset %d: %s" % (erroffset, detail))
        # __init__ may be called again on a live object; do not leak the
        # previously compiled code.
        if self.code != NULL:
            pcre2_code_free(self.code)
        self.code = compiled

    def match(self, unicode inp, opt: MatchOption = None, PCRE2_SIZE startoffset = 0, int search_only = 0) -> Match:
        """Match the pattern against ``inp``.

        Args:
            inp: subject string.  It is matched in its UTF-8 form up to the
                first NUL character, because the length is passed as
                ``PCRE2_ZERO_TERMINATED``.
            opt: match options, a ``MatchOption`` or a plain ``int``.
            startoffset: offset into the UTF-8 subject at which to start.
            search_only: when non-zero, only report whether a match exists.

        Returns:
            A ``Match`` on success and None when nothing matched.  With
            ``search_only`` set, True or False instead.

        Raises:
            ValueError: ``inp`` is None, no pattern is compiled, or PCRE2
                reports an error.
            MemoryError: the match data block could not be allocated.
        """
        cdef uint32_t opts = 0
        cdef const char* subject
        cdef pcre2_match_data* data
        cdef int rc
        cdef Match m
        if inp is None:
            raise ValueError(u'Empty input.')
        if self.code == NULL:
            raise ValueError(u'pattern is NULL.')
        if isinstance(opt, MatchOption):
            opts = opt.value
        elif isinstance(opt, int):
            opts = opt
        # The UTF-8 buffer belongs to ``inp``.  The returned Match keeps a
        # reference to ``inp``, so the buffer outlives the match data that
        # points into it.
        subject = PyUnicode_AsUTF8(inp)
        if subject == NULL:
            raise ValueError(u'Can not encode input as UTF-8.')
        data = pcre2_match_data_create_from_pattern(self.code, NULL)
        if data == NULL:
            raise MemoryError()
        rc = pcre2_match(self.code, <PCRE2_SPTR>subject,
                         PCRE2_ZERO_TERMINATED, startoffset, opts, data, NULL)
        if rc <= 0:
            pcre2_match_data_free(data)
            if rc == -1:  # No match
                if search_only:
                    return False
                return None
            if rc == 0:
                raise ValueError(u'The vector of offsets is too small')
            raise ValueError(self._error_text(rc) or u'Can not match')
        if search_only:
            pcre2_match_data_free(data)
            return True
        m = Match(inp, self)
        m.set_data(data)  # ownership of ``data`` moves to the Match
        return m

    def namedtable(self):
        """Return a dict mapping group number to group name.

        A pattern without named groups yields an empty dict.

        Raises:
            ValueError: no pattern is compiled, or PCRE2 cannot supply the
                name table.
        """
        cdef uint32_t count = 0
        cdef uint32_t entry_size = 0
        cdef PCRE2_SPTR table = NULL
        cdef uint32_t i
        cdef size_t offset
        cdef char* entry_name
        if self.code == NULL:
            raise ValueError(u'pattern is NULL.')
        if pcre2_pattern_info(self.code, PCRE2_INFO_NAMECOUNT, &count) != 0:
            raise ValueError(u'Can not get namedtable')
        if count == 0:
            # No named groups is a normal situation, not an error.
            return {}
        if pcre2_pattern_info(self.code, PCRE2_INFO_NAMEENTRYSIZE, &entry_size) != 0:
            raise ValueError(u'Can not get namedtable')
        if entry_size == 0:
            raise ValueError(u'Can not get namedtable')
        if pcre2_pattern_info(self.code, PCRE2_INFO_NAMETABLE, &table) != 0:
            raise ValueError(u'Can not get namedtable')
        names = {}
        for i in range(count):
            # Each entry: two bytes of group number (most significant first),
            # followed by the zero-terminated name.
            offset = <size_t>i * entry_size
            entry_name = <char*>table + offset + 2
            names[table[offset] * 256 + table[offset + 1]] = entry_name.decode('utf-8')
        return names