
oasis-root

Compiled tree of Oasis Linux, based on the custom branch at <https://hacktivis.me/git/oasis/>.

git clone https://anongit.hacktivis.me/git/oasis-root.git

sre_compile.py (26695B)


#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre
import sre_parse
from sre_constants import *

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}

# Sets of lowercase characters which have the same uppercase.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ſtst
)

# Maps the lowercase code to lowercase codes which have the same uppercase.
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}

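# Illustrative example (not part of the upstream file): for the pair
# (0x73, 0x17f) above, this comprehension yields
#
#     _ignorecase_fixes[0x73]  == (0x17f,)    # 's' -> (LONG S,)
#     _ignorecase_fixes[0x17f] == (0x73,)     # 'ſ' -> ('s',)
#
# i.e. each lowercase code maps to the other lowercase codes that share its
# uppercase form.
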
def _combine_flags(flags, add_flags, del_flags,
                   TYPE_FLAGS=sre_parse.TYPE_FLAGS):
    if add_flags & TYPE_FLAGS:
        flags &= ~TYPE_FLAGS
    return (flags | add_flags) & ~del_flags

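# For illustration (a sketch assuming the standard flag constants from
# sre_constants): an inline group such as (?a:...) replaces the string-type
# flag, while (?i:...) / (?-i:...) merely add or remove flags:
#
#     _combine_flags(SRE_FLAG_UNICODE, SRE_FLAG_ASCII, 0) == SRE_FLAG_ASCII
#     _combine_flags(0, SRE_FLAG_IGNORECASE, 0) == SRE_FLAG_IGNORECASE
#     _combine_flags(SRE_FLAG_IGNORECASE, 0, SRE_FLAG_IGNORECASE) == 0
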
def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    iscased = None
    tolower = None
    fixes = None
    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
        if flags & SRE_FLAG_UNICODE:
            iscased = _sre.unicode_iscased
            tolower = _sre.unicode_tolower
            fixes = _ignorecase_fixes
        else:
            iscased = _sre.ascii_iscased
            tolower = _sre.ascii_tolower
    for op, av in pattern:
        if op in LITERAL_CODES:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
                emit(av)
            elif flags & SRE_FLAG_LOCALE:
                emit(OP_LOCALE_IGNORE[op])
                emit(av)
            elif not iscased(av):
                emit(op)
                emit(av)
            else:
                lo = tolower(av)
                if not fixes: # ascii
                    emit(OP_IGNORE[op])
                    emit(lo)
                elif lo not in fixes:
                    emit(OP_UNICODE_IGNORE[op])
                    emit(lo)
                else:
                    emit(IN_UNI_IGNORE)
                    skip = _len(code); emit(0)
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip
        elif op is IN:
            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
                emit(IN_LOC_IGNORE)
            elif not hascased:
                emit(IN)
            elif not fixes: # ascii
                emit(IN_IGNORE)
            else:
                emit(IN_UNI_IGNORE)
            skip = _len(code); emit(0)
            _compile_charset(charset, flags, code)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
            if _simple(av[2]):
                if op is MAX_REPEAT:
                    emit(REPEAT_ONE)
                else:
                    emit(MIN_REPEAT_ONE)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                emit(REPEAT)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(MAX_UNTIL)
                else:
                    emit(MIN_UNTIL)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if group:
                emit(MARK)
                emit((group-1)*2)
            # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(op)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(JUMP)
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(FAILURE) # end of branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
            elif flags & SRE_FLAG_LOCALE:
                emit(GROUPREF_LOC_IGNORE)
            elif not fixes: # ascii
                emit(GROUPREF_IGNORE)
            else:
                emit(GROUPREF_UNI_IGNORE)
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(op)
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(JUMP)
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise error("internal: unsupported operand type %r" % (op,))

def _compile_charset(charset, flags, code):
    # compile charset subprogram
    emit = code.append
    for op, av in charset:
        emit(op)
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(av)
        elif op is RANGE or op is RANGE_UNI_IGNORE:
            emit(av[0])
            emit(av[1])
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CH_LOCALE[av])
            elif flags & SRE_FLAG_UNICODE:
                emit(CH_UNICODE[av])
            else:
                emit(av)
        else:
            raise error("internal: unsupported set operator %r" % (op,))
    emit(FAILURE)

def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
    # internal: optimize character set
    out = []
    tail = []
    charmap = bytearray(256)
    hascased = False
    for op, av in charset:
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                        if not hascased and iscased(av):
                            hascased = True
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        if fixes:
                            for i in map(fixup, r):
                                charmap[i] = 1
                                if i in fixes:
                                    for k in fixes[i]:
                                        charmap[k] = 1
                        else:
                            for i in map(fixup, r):
                                charmap[i] = 1
                        if not hascased:
                            hascased = any(map(iscased, r))
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
                if fixup:
                    hascased = True
                    # There are only two ranges of cased non-BMP characters:
                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
                    # and for both ranges RANGE_UNI_IGNORE works.
                    if op is RANGE:
                        op = RANGE_UNI_IGNORE
                tail.append((op, av))
            break

    # compress character map
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if hascased or len(out) < len(charset):
            return out, hascased
        # else original character set is good enough
        return charset, hascased

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out, hascased

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    comps = {}
    mapping = bytearray(256)
    block = 0
    data = bytearray()
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out, hascased

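# Rough sketch of the effect (an illustration, not part of the upstream file):
# a contiguous set such as [a-z] survives as a single RANGE, while a set with
# three or more separate runs such as [ace] is turned into a 256-bit CHARSET
# bitmap:
#
#     _optimize_charset([(RANGE, (0x61, 0x7a))])
#         -> ([(RANGE, (0x61, 0x7a))], False)      # kept as-is
#     _optimize_charset([(LITERAL, 0x61), (LITERAL, 0x63), (LITERAL, 0x65)])
#         -> ([(CHARSET, [...])], False)           # list of bitmap words
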
_CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1
_BITS_TRANS = b'0' + b'1' * 255

def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    s = bits.translate(_BITS_TRANS)[::-1]
    return [_int(s[i - _CODEBITS: i], 2)
            for i in range(len(s), 0, -_CODEBITS)]

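# Worked example (an illustration; assumes _sre.CODESIZE == 4, i.e. 32-bit
# code words): for a 256-byte charmap with only positions 0 and 65 set,
#
#     charmap = bytearray(256); charmap[0] = charmap[65] = 1
#     _mk_bitmap(charmap) == [1, 0, 2, 0, 0, 0, 0, 0]
#
# bit 0 lands in word 0 and bit 65 becomes bit 1 of word 2, with the least
# significant word first.
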
def _bytes_to_codes(b):
    # Convert block indices to word array
    a = memoryview(b).cast('I')
    assert a.itemsize == _sre.CODESIZE
    assert len(a) * a.itemsize == len(b)
    return a.tolist()

def _simple(p):
    # check if this subpattern is a "simple" operator
    if len(p) != 1:
        return False
    op, av = p[0]
    if op is SUBPATTERN:
        return av[0] is None and _simple(av[-1])
    return op in _UNIT_CODES

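# For instance (an illustration, not from the upstream file): the repeated
# body of a* is a single LITERAL, so _compile() can emit REPEAT_ONE, whereas
# the body of (ab)* is a capturing SUBPATTERN and falls back to the generic
# REPEAT/MAX_UNTIL form:
#
#     _simple([(LITERAL, 0x61)])                                      # True
#     _simple([(SUBPATTERN, (1, 0, 0, [(LITERAL, 0x61),
#                                      (LITERAL, 0x62)]))])           # False
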
def _generate_overlap_table(prefix):
    """
    Generate an overlap table for the following prefix.
    An overlap table is a table of the same size as the prefix which
    informs about the potential self-overlap for each index in the prefix:

    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
      prefix[0:k]
    """
    table = [0] * len(prefix)
    for i in range(1, len(prefix)):
        idx = table[i - 1]
        while prefix[i] != prefix[idx]:
            if idx == 0:
                table[i] = 0
                break
            idx = table[idx - 1]
        else:
            table[i] = idx + 1
    return table

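# Worked example (an illustration, not part of the upstream file): for the
# prefix "ababc", i.e. the code list [0x61, 0x62, 0x61, 0x62, 0x63],
#
#     _generate_overlap_table([0x61, 0x62, 0x61, 0x62, 0x63]) == [0, 0, 1, 2, 0]
#
# overlap[3] == 2 records that prefix[2:4] ("ab") matches prefix[0:2], which
# lets the matcher resume a prefix search without rescanning from the start.
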
def _get_iscased(flags):
    if not flags & SRE_FLAG_IGNORECASE:
        return None
    elif flags & SRE_FLAG_UNICODE:
        return _sre.unicode_iscased
    else:
        return _sre.ascii_iscased

def _get_literal_prefix(pattern, flags):
    # look for literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = None
    iscased = _get_iscased(flags)
    for op, av in pattern.data:
        if op is LITERAL:
            if iscased and iscased(av):
                break
            prefixappend(av)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            flags1 = _combine_flags(flags, add_flags, del_flags)
            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
                break
            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
            if prefix_skip is None:
                if group is not None:
                    prefix_skip = len(prefix)
                elif prefix_skip1 is not None:
                    prefix_skip = len(prefix) + prefix_skip1
            prefix.extend(prefix1)
            if not got_all:
                break
        else:
            break
    else:
        return prefix, prefix_skip, True
    return prefix, prefix_skip, False

def _get_charset_prefix(pattern, flags):
    while True:
        if not pattern.data:
            return None
        op, av = pattern.data[0]
        if op is not SUBPATTERN:
            break
        group, add_flags, del_flags, pattern = av
        flags = _combine_flags(flags, add_flags, del_flags)
        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
            return None

    iscased = _get_iscased(flags)
    if op is LITERAL:
        if iscased and iscased(av):
            return None
        return [(op, av)]
    elif op is BRANCH:
        charset = []
        charsetappend = charset.append
        for p in av[1]:
            if not p:
                return None
            op, av = p[0]
            if op is LITERAL and not (iscased and iscased(av)):
                charsetappend((op, av))
            else:
                return None
        return charset
    elif op is IN:
        charset = av
        if iscased:
            for op, av in charset:
                if op is LITERAL:
                    if iscased(av):
                        return None
                elif op is RANGE:
                    if av[1] > 0xffff:
                        return None
                    if any(map(iscased, range(av[0], av[1]+1))):
                        return None
        return charset
    return None

def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if hi > MAXCODE:
        hi = MAXCODE
    if lo == 0:
        code.extend([INFO, 4, 0, lo, hi])
        return
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
        # look for literal prefix
        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
        # if no prefix, look for charset prefix
        if not prefix:
            charset = _get_charset_prefix(pattern, flags)
##     if prefix:
##         print("*** PREFIX", prefix, prefix_skip)
##     if charset:
##         print("*** CHARSET", charset)
    # add an info block
    emit = code.append
    emit(INFO)
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if prefix_skip is None and got_all:
            mask = mask | SRE_INFO_LITERAL
    elif charset:
        mask = mask | SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    emit(min(hi, MAXCODE))
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        if prefix_skip is None:
            prefix_skip = len(prefix)
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        code.extend(_generate_overlap_table(prefix))
    elif charset:
        charset, hascased = _optimize_charset(charset)
        assert not hascased
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip

def isstring(obj):
    return isinstance(obj, (str, bytes))

def _code(p, flags):

    flags = p.state.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(SUCCESS)

    return code

def _hex_code(code):
    return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)

def dis(code):
    import sys

    labels = set()
    level = 0
    offset_width = len(str(len(code) - 1))

    def dis_(start, end):
        def print_(*args, to=None):
            if to is not None:
                labels.add(to)
                args += ('(to %d)' % (to,),)
            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
                  end='  '*(level-1))
            print(*args)

        def print_2(*args):
            print(end=' '*(offset_width + 2*level))
            print(*args)

        nonlocal level
        level += 1
        i = start
        while i < end:
            start = i
            op = code[i]
            i += 1
            op = OPCODES[op]
            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
                      MAX_UNTIL, MIN_UNTIL, NEGATE):
                print_(op)
            elif op in (LITERAL, NOT_LITERAL,
                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, '%#02x (%r)' % (arg, chr(arg)))
            elif op is AT:
                arg = code[i]
                i += 1
                arg = str(ATCODES[arg])
                assert arg[:3] == 'AT_'
                print_(op, arg[3:])
            elif op is CATEGORY:
                arg = code[i]
                i += 1
                arg = str(CHCODES[arg])
                assert arg[:9] == 'CATEGORY_'
                print_(op, arg[9:])
            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op in (RANGE, RANGE_UNI_IGNORE):
                lo, hi = code[i: i+2]
                i += 2
                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
            elif op is CHARSET:
                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
                i += 256//_CODEBITS
            elif op is BIGCHARSET:
                arg = code[i]
                i += 1
                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
                                        for x in code[i: i + 256//_sre.CODESIZE]))
                print_(op, arg, mapping)
                i += 256//_sre.CODESIZE
                level += 1
                for j in range(arg):
                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                    i += 256//_CODEBITS
                level -= 1
            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
                        GROUPREF_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, arg)
            elif op is JUMP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                i += 1
            elif op is BRANCH:
                skip = code[i]
                print_(op, skip, to=i+skip)
                while skip:
                    dis_(i+1, i+skip)
                    i += skip
                    start = i
                    skip = code[i]
                    if skip:
                        print_('branch', skip, to=i+skip)
                    else:
                        print_(FAILURE)
                i += 1
            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
                skip, min, max = code[i: i+3]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, min, max, to=i+skip)
                dis_(i+3, i+skip)
                i += skip
            elif op is GROUPREF_EXISTS:
                arg, skip = code[i: i+2]
                print_(op, arg, skip, to=i+skip)
                i += 2
            elif op in (ASSERT, ASSERT_NOT):
                skip, arg = code[i: i+2]
                print_(op, skip, arg, to=i+skip)
                dis_(i+2, i+skip)
                i += skip
            elif op is INFO:
                skip, flags, min, max = code[i: i+4]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, bin(flags), min, max, to=i+skip)
                start = i+4
                if flags & SRE_INFO_PREFIX:
                    prefix_len, prefix_skip = code[i+4: i+6]
                    print_2('  prefix_skip', prefix_skip)
                    start = i + 6
                    prefix = code[start: start+prefix_len]
                    print_2('  prefix',
                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
                            '(%r)' % ''.join(map(chr, prefix)))
                    start += prefix_len
                    print_2('  overlap', code[start: start+prefix_len])
                    start += prefix_len
                if flags & SRE_INFO_CHARSET:
                    level += 1
                    print_2('in')
                    dis_(start, i+skip)
                    level -= 1
                i += skip
            else:
                raise ValueError(op)

        level -= 1

    dis_(0, len(code))

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    if flags & SRE_FLAG_DEBUG:
        print()
        dis(code)

    # map in either direction
    groupindex = p.state.groupdict
    indexgroup = [None] * p.state.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.state.flags, code,
        p.state.groups-1,
        groupindex, tuple(indexgroup)
        )
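
# ----------------------------------------------------------------------------
# Usage sketch (an illustration, not part of the upstream file). This module
# is normally driven by re.compile(), but on CPython versions that still ship
# it under the sre_compile name (the pre-3.11 layout) it can be exercised
# directly:
#
#     >>> import sre_compile, sre_constants
#     >>> p = sre_compile.compile(r"a[bc]+d", sre_constants.SRE_FLAG_IGNORECASE)
#     >>> p.match("ABCCD")                     # doctest: +ELLIPSIS
#     <re.Match object...>
#     >>> _ = sre_compile.compile(r"ab|ac", sre_constants.SRE_FLAG_DEBUG)
#
# Passing SRE_FLAG_DEBUG makes compile() run dis() on the generated code
# before handing it to _sre.compile().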