import re


def auto_regex(items):
    """Infer one regular expression that matches every string in *items*.

    The middle element of *items* is used as a template. It is split into
    coarse HTML-ish tokens (tag openers, ``>``, attribute names, quoted
    attribute values, text runs), and the pattern is grown one token at a
    time. For each token, three extensions of the current pattern are tried
    in order:

    1. the token as a literal,
    2. a ``(.*)`` capture group followed by the literal token,
    3. a bare ``(.*)`` capture group.

    The first extension that still matches (via ``re.match``) every string
    in *items* is kept. A bare wildcard is not appended directly after
    another wildcard, so the result never contains ``(.*)(.*)``.

    Args:
        items: Sequence of strings (e.g. serialized sibling HTML elements)
            that are expected to share a common structure.

    Returns:
        The inferred pattern as a string, without a leading ``^`` anchor.
        Literal parts are escaped with ``re.escape``; varying parts are
        ``(.*)`` capture groups. An empty *items* yields ``''``.

    Note:
        ``(.*)`` is built and tested without ``re.DOTALL``, so a varying
        region cannot span a newline.
    """
    # No template to tokenize: return the pattern with no constraints
    # instead of failing on items[0] of an empty sequence.
    if not items:
        return ''

    # NOTE(review): the `]*>` alternative looks like a remnant of a longer
    # alternative (it only matches optional `]` characters followed by `>`);
    # kept byte-for-byte here -- confirm against the upstream source.
    tokenizer = re.compile(r' \n | <[^\s>]* | > | ]*> | \s[^=]*= | "[^"]*" | [^<>]+ ', re.VERBOSE)

    pattern = '^'
    for tok in tokenizer.findall(items[len(items) // 2]):
        tok = re.escape(tok)
        candidates = (
            f'{pattern}{tok}',      # token is constant across all items
            f'{pattern}(.*){tok}',  # something varies before the token
            f'{pattern}(.*)',       # the token itself varies
        )
        for candidate in candidates:
            # Compile once per candidate rather than once per item.
            matches = re.compile(candidate).match
            if all(matches(item) for item in items):
                # Two adjacent wildcards add nothing; keep the old pattern.
                if not candidate.endswith('(.*)(.*)'):
                    pattern = candidate
                break
    # Drop the '^' that was only needed while growing the pattern.
    return pattern[1:]