From 075a0855f4f819ca43b9fd29fa43140bad97f972 Mon Sep 17 00:00:00 2001
From: jay817
Date: Sun, 13 Oct 2024 00:17:56 -0400
Subject: [PATCH] Add snurad/README.md

---
 snurad/README.md | 89 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 snurad/README.md

diff --git a/snurad/README.md b/snurad/README.md
new file mode 100644
index 0000000..dfaf58a
--- /dev/null
+++ b/snurad/README.md
@@ -0,0 +1,89 @@
+```python
+def SQL(db='db.sqlite'):
+    """Open `db` in autocommit WAL mode and return a query helper.
+
+    The helper is called as `sql(query, *params)` and returns all rows as a list.
+    A `cache(url PRIMARY KEY, blob)` table is created if it does not exist yet.
+    """
+    sql = __import__('sqlite3').connect(db, isolation_level=None).execute
+    sql('PRAGMA journal_mode=WAL')
+    sql('PRAGMA wal_checkpoint(FULL)')
+    # busy_timeout is in integer milliseconds; format an int, not the float 1e9.
+    sql(f'PRAGMA busy_timeout={10**9}')
+    sql('CREATE TABLE IF NOT EXISTS cache(url PRIMARY KEY, blob)')
+    return lambda q, *p: list(sql(q, p))
+sql = SQL('snurad.sqlite')
+
+# Best-effort guard: anything raised inside `with unsafe:` is silently dropped.
+unsafe = __import__('contextlib').suppress(Exception)
+
+# Define red(), green(), ... helpers that wrap text in ANSI color codes 91-96.
+globals().update({color: lambda text, ansi=91+i: f'\x1b[{ansi}m{text}\x1b[0m'
+    for i, color in enumerate('red green yellow blue magenta cyan'.split())})
+
+def sync(func):
+    """Wrap `func` so that a coroutine it returns is run to completion synchronously.
+
+    nest_asyncio is applied so this also works inside an already-running event loop
+    (as in a notebook). Results that are not coroutines are returned unchanged, so
+    plain (non-async) callables can be wrapped safely too.
+    """
+    __import__('nest_asyncio').apply()
+    import functools, asyncio
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        result = func(*args, **kwargs)
+        return asyncio.run(result) if asyncio.iscoroutine(result) else result
+    return wrapper
+
+@sync
+async def Context():
+    """Launch Firefox and return a browser context that caches every response.
+
+    `context.new_page` is replaced by a synchronous factory whose pages expose
+    synchronous methods; `page.goto(url)` returns the page parsed by BeautifulSoup.
+    Each successful response body is stored in the `cache` table keyed by its URL.
+    """
+    from playwright.async_api import async_playwright
+    playwright = await async_playwright().start()
+    browser = await playwright.firefox.launch()
+    context = await browser.new_context()
+    context.set_default_timeout(0)  # 0 disables Playwright's timeouts
+    @sync
+    async def new_page(new_page=context.new_page):
+        page = await new_page()
+        # Replace every public callable on the page with its synchronous wrapper.
+        for attr in dir(page):
+            if not attr.startswith('_') and callable(method := getattr(page, attr)):
+                setattr(page, attr, sync(method))
+        def goto(url, goto=page.goto):
+            goto(url)
+            # Show a screenshot when `display` is available (IPython); ignore failures.
+            with unsafe: display(__import__('IPython').display.Image(page.screenshot()))
+            return __import__('bs4').BeautifulSoup(page.content(), 'lxml')
+        page.goto = goto
+        return page
+    context.new_page = new_page
+    async def cache_response(route):
+        response = await route.fetch()
+        if response.ok:
+            url, blob = route.request.url, await response.body()
+            sql('INSERT OR REPLACE INTO cache VALUES(?,?)', url, blob)
+            print(red(url))
+        # Always fulfill, even for non-OK responses, so no request is left hanging.
+        await route.fulfill(response=response)
+    await context.route('**/*', cache_response)
+    return context
+context = Context()
+page = context.new_page()
+
+url = 'http://self-learning.snurad.snu.ac.kr/main/sub_page.php?p_id=2&c_id=45'
+soup = page.goto(url)
+
+# Visit every site-relative link found on the start page.
+for a in soup.find_all(href=True):
+    # startswith() is safe for an empty href, unlike indexing href[0].
+    if (href := a['href']).startswith('/'):
+        page.goto('http://self-learning.snurad.snu.ac.kr' + href)
+        print(green(href))
+```
\ No newline at end of file