PITÃO
Abaixo está um gerador de regex auto-correspondente. Você fornece duas listas, uma contém dados de treinamento que a regex deve corresponder (além de corresponder a si mesma), a outra contém dados de treinamento que a regex NÃO deve corresponder:
from random import choice, randrange
import re
from itertools import zip_longest, chain, islice
from operator import itemgetter
CHAR_SET = [chr(i) for i in range(128)] + [r"\\", r"\d", r"\D",
r"\w", r"\W", r"\s",
r"\S", r"?:", r"\1",
r"\2", r"\A", r"\b",
r"\B", r"\Z", r"\.",
r"\[", r"\]", r"\(",
r"\)", r"\{", r"\}",
r"\+", r"\|", r"\?",
r"\*"]
CHAR_SAMPLE = []
BREAKPOINT = re.compile(
r"""
\(.*?\)|
\[.*?\]|
\{.*?\}|
\w+(?=[\(\[\{])?|
\S+?|
\.\*\??|
\.\+\??|
\.\?\??|
\\.|
.*?
""",
re.VERBOSE)
MATCH_BRACKETS = {'(': ')', '[': ']', '{': '}'}
CLOSE_BRACKETS = {')', ']', '}'}
REGEX_SEEDER = [
r".*?",
r"(?:.*?)",
r"\w|\s",
r"(?<.*?)",
r"(?=.*?)",
r"(?!.*?)",
r"(?<=.*?)",
r"(?<!.*?)",
]
LEN_LIMIT = 100
def distribute(distribution):
global CHAR_SAMPLE
for item in CHAR_SET:
if item in distribution:
CHAR_SAMPLE.extend([item] * distribution[item])
else:
CHAR_SAMPLE.append(item)
def rand_index(seq, stop=None):
if stop is None:
stop = len(seq)
try:
return randrange(0, stop)
except ValueError:
return 0
def rand_slice(seq):
try:
start = randrange(0, len(seq))
stop = randrange(start, len(seq))
return slice(start, stop)
except ValueError:
return slice(0, 0)
#Mutation Functions
def replace(seq):
seq[rand_index(seq)] = choice(CHAR_SAMPLE)
def delete(seq):
del seq[rand_index(seq)]
def insert(seq):
seq.insert(rand_index(seq, len(seq) + 1), choice(CHAR_SAMPLE))
def duplicate(seq):
source = rand_slice(seq)
seq[source.stop: source.stop] = seq[source]
def swap(seq):
if len(seq) < 2: return
a = rand_index(seq, len(seq) - 1)
seq[a], seq[a + 1] = seq[a + 1], seq[a]
dummy = lambda seq: None
MUTATE = (
replace,
delete,
insert,
duplicate,
swap,
dummy,
dummy,
)
def repair_brackets(seq):
"""Attempts to lower the percentage of invalid regexes by
matching orphaned brackets"""
p_stack, new_seq = [], []
for item in seq:
if item in MATCH_BRACKETS:
p_stack.append(item)
elif item in CLOSE_BRACKETS:
while p_stack and MATCH_BRACKETS[p_stack[-1]] != item:
new_seq.append(MATCH_BRACKETS[p_stack[-1]])
p_stack.pop()
if not p_stack:
continue
else:
p_stack.pop()
new_seq.append(item)
while p_stack:
new_seq.append(MATCH_BRACKETS[p_stack.pop()])
return new_seq
def compress(seq):
new_seq = [seq[0]]
last_match = seq[0]
repeat = 1
for item in islice(seq, 1, len(seq)):
if item == last_match:
repeat += 1
else:
if repeat > 1:
new_seq.extend(list("{{{0}}}".format(repeat)))
new_seq.append(item)
last_match = item
repeat = 1
else:
if repeat > 1:
new_seq.extend(list("{{{0}}}".format(repeat)))
return new_seq
def mutate(seq):
"""Random in-place mutation of sequence"""
if len(seq) > LEN_LIMIT:
seq[:] = seq[:LEN_LIMIT]
c = choice(MUTATE)
c(seq)
def crossover(seqA, seqB):
"""Recombination of two sequences at optimal breakpoints
along each regex strand"""
bpA = [item.start() for item in BREAKPOINT.finditer(''.join(seqA))]
bpB = [item.start() for item in BREAKPOINT.finditer(''.join(seqA))]
slObjA = (slice(*item) for item in zip(bpA, bpA[1:]))
slObjB = (slice(*item) for item in zip(bpB, bpB[1:]))
slices = zip_longest(
(seqA[item] for item in slObjA),
(seqB[item] for item in slObjB),
fillvalue=[]
)
recombinant = (choice(item) for item in slices)
return list(chain.from_iterable(recombinant))
#Fitness testing
def match_percentage(match):
"""Calculates the percentage a text actually matched
by a regular expression"""
if match and match.endpos:
return (match.end() - match.start()) / match.endpos
else:
return 0.001
def fitness_test(seq, pos_matches, neg_matches):
"""Scoring algorithm to determine regex fitness"""
try:
self_str = ''.join(seq)
regex = re.compile(self_str)
except (re.error, IndexError):
seq[:] = repair_brackets(seq)
try:
self_str = ''.join(seq)
regex = re.compile(self_str)
except (re.error, IndexError):
return 0.001
pos_score = sum(match_percentage(regex.search(item))
for item in pos_matches) / len(pos_matches) / 3
neg_score = (1 - sum(match_percentage(regex.search(item))
for item in neg_matches) / len(neg_matches)) / 3
self_score = match_percentage(regex.search(self_str)) / 3
return pos_score + self_score + neg_score
#Population Management
def generate_pop(pos_matches, neg_matches, pop_size):
sources = (pos_matches, REGEX_SEEDER)
return [crossover(
choice(choice(sources)), choice(choice(sources))
) for i in range(pop_size)]
def glean_pop(population, cutoff, fit_test, ft_args=()):
scores = (fit_test(bug, *ft_args) for bug in population)
ranked = sorted(zip(population, scores), key=itemgetter(1), reverse=True)
maxItem = ranked[0]
new_pop = next(zip(*ranked))[:cutoff]
return maxItem, new_pop
def repopulate(population, pop_size):
cutoff = len(population)
for i in range(pop_size // cutoff):
population.extend([crossover(choice(population), choice(population))
for i in range(cutoff)])
population.extend([population[i][:] for i in range(pop_size - len(population))])
#Simulator
def simulate(pos_matches, neg_matches, pop_size=50, cutoff=10, threshold=1.0):
population = generate_pop(pos_matches, neg_matches, pop_size)
while True:
for bug in population:
mutate(bug)
#Scoring step
max_item, population = glean_pop(
population,
cutoff,
fitness_test,
(pos_matches, neg_matches)
)
#Exit condition:
max_regex, max_score = max_item
if max_score >= threshold:
return max_score, max_regex
"""
print(max_score, ''.join(max_regex))
input("next?")"""
#Repopulation Step:
population = list(population)
repopulate(population, pop_size)