70 lines
2.9 KiB
Python
70 lines
2.9 KiB
Python
# Shared context manager: `with unsafe:` silently discards any Exception
# raised inside the block.
unsafe = __import__('contextlib').suppress(Exception)


def Soup(html):
    """Parse *html* with BeautifulSoup using the lxml parser.

    bs4 is looked up at call time, so it is only required once Soup is used.
    """
    return __import__('bs4').BeautifulSoup(html, 'lxml')


# Publish one module-level colouring helper per letter: R, G, Y, B, M, C wrap
# their argument in the SGR escape codes 91..96 followed by a reset (the
# letters presumably stand for red, green, yellow, blue, magenta, cyan).
for i, c in enumerate('RGYBMC'):
    def _tint(s, i=i):
        # `i=i` freezes the current index; a plain closure would see the last one.
        return f'\x1b[{91+i}m{s}\x1b[0m'
    globals()[c] = _tint
|
|
|
|
def SQL():
    """Open the on-disk key/value store and return a handle to it.

    Side effects: opens (creating if needed) the SQLite file ``.db`` in the
    current directory, switches it to WAL journaling, creates the ``kv``
    table if missing, and creates the ``.db-blob`` directory.

    The table is append-only: every assignment inserts a new ``(k, v, t)``
    row and reads return the newest row for the key.

    The returned object supports:

    * ``store(query, *params)`` - run SQL, return the rows as a list of dicts.
    * ``store[key] = value``    - append a new version of ``key``.
    * ``store[key]``            - newest value of ``key`` (IndexError if absent).
    * ``iter(store)``           - ``(k, v)`` value views, one per distinct key.
    * ``store.put(filename, blob)`` / ``store.get(filename)`` - content-addressed
      blob storage under ``.db-blob/<sha1>``.
    """
    import sqlite3, os, hashlib

    # isolation_level=None: no implicit transactions, each statement is
    # committed as soon as it runs.
    con = sqlite3.connect('.db', isolation_level=None)
    con.row_factory = sqlite3.Row
    con.execute('PRAGMA journal_mode=wal')
    # Wait (practically) forever on a locked database instead of failing.
    # SQLite expects an integer number of milliseconds here.
    con.execute('PRAGMA busy_timeout=1000000000')
    con.execute('CREATE TABLE IF NOT EXISTS kv(k, v, t DEFAULT CURRENT_TIMESTAMP)')
    os.makedirs('.db-blob', exist_ok=True)

    class Store:
        """Thin dict-like facade over the ``kv`` table and the blob directory."""

        def __call__(self, q, *p):
            """Execute ``q`` with positional parameters ``p``; return rows as dicts."""
            return list(map(dict, con.execute(q, p)))

        def __setitem__(self, k, v):
            """Append a new ``(k, v)`` row; ``t`` defaults to the current timestamp."""
            self('INSERT INTO kv(k,v) VALUES(?,?)', k, v)

        def __getitem__(self, k):
            """Return the most recently written value for ``k``.

            Raises IndexError when ``k`` has never been written.
            """
            # CURRENT_TIMESTAMP only has one-second resolution, so two writes
            # in the same second tie on `t`; rowid breaks the tie in favour of
            # the later insert.
            rows = self(
                'SELECT v FROM kv WHERE k=? ORDER BY t DESC, rowid DESC LIMIT 1', k)
            return rows[0]['v']

        def __iter__(self):
            """Iterate over ``(k, v)`` value views, one per distinct key."""
            # NOTE(review): relies on SQLite's bare-column behaviour with
            # MAX(); rows of one key that share the same `t` are presumably
            # resolved arbitrarily - confirm if exact agreement with
            # __getitem__ matters.
            return (kv.values() for kv in self(
                'SELECT k, v FROM kv GROUP BY k HAVING t = MAX(t)'))

        def put(self, filename, blob):
            """Store ``blob`` (bytes) under its SHA-1 and point ``filename`` at it.

            The blob file is written only when no row references that hash
            yet; a new ``filename -> sha1`` row is appended in every case.
            """
            sha1 = hashlib.sha1(blob).hexdigest()
            if not self('SELECT 1 FROM kv WHERE v=? LIMIT 1', sha1):
                try:
                    # 'x' mode refuses to overwrite an existing blob file.
                    with open(f'.db-blob/{sha1}', 'xb') as f:
                        f.write(blob)
                    # G is the module-level colouring helper defined elsewhere
                    # in this file.
                    print(f'{G(len(blob)):>16} {filename}')
                except FileExistsError:
                    pass
            self[filename] = sha1

        def get(self, filename):
            """Return the bytes of the newest blob recorded for ``filename``."""
            # Context manager so the file handle is closed deterministically.
            with open(f'.db-blob/{self[filename]}', 'rb') as f:
                return f.read()

    return Store()


# Module-wide store shared by the rest of the script.
sql = SQL()
|
|
|
|
def sync(coro):
    """Turn a coroutine function into a blocking function.

    Anything that is not a coroutine function is returned unchanged. The
    wrapper schedules the coroutine on the current event loop and then drives
    that loop by hand, one callback at a time, until the coroutine finishes.
    It never calls run_until_complete(); NOTE(review): presumably so that it
    can be used while the loop is already running (e.g. in a notebook) -
    confirm.

    This reaches into private attributes of the loop (``_selector``,
    ``_ready``, ``_scheduled``, ``_process_events``) and of asyncio
    (``asyncio.tasks._current_tasks``), so it only works with loops that
    expose them.

    Returns the wrapper; calling it returns the coroutine's result or raises
    its exception.
    """
    import asyncio, functools, heapq

    if not asyncio.iscoroutinefunction(coro):
        return coro

    @functools.wraps(coro)
    def wrapper(*args, **kwargs):
        loop = asyncio.get_event_loop()
        future = asyncio.ensure_future(coro(*args, **kwargs))
        tasks = asyncio.tasks._current_tasks
        while not future.done():
            # Poll I/O without blocking and queue the resulting callbacks.
            loop._process_events(loop._selector.select(0))

            # Promote timers that are due (call_later / call_at) to the ready
            # queue; without this, anything waiting on a timer never wakes up
            # and this loop spins forever.
            due = loop.time() + loop._clock_resolution
            while loop._scheduled and loop._scheduled[0]._when < due:
                timer = heapq.heappop(loop._scheduled)
                timer._scheduled = False
                loop._ready.append(timer)

            ready = loop._ready
            if not ready:
                continue
            handle = ready.popleft()
            if handle._cancelled is not False:
                continue  # cancelled callbacks are dropped

            # Hide the task we are currently inside (if any) so the callback
            # may enter another task, then put it back afterwards.
            task = tasks.pop(loop, None)
            try:
                handle._run()
            finally:
                if task is None:
                    # Do not leave a `loop: None` entry behind: a present key
                    # can be read as "a task is already running" and block
                    # every later task on this loop.
                    tasks.pop(loop, None)
                else:
                    tasks[loop] = task
        return future.result()

    return wrapper
|
|
|
|
@sync
async def Page(headless=True):
    """Launch Firefox through Playwright and return a blocking, archiving page.

    Args:
        headless: passed through to ``firefox.launch``.

    Returns:
        A Playwright page on which:

        * every callable attribute has been passed through ``sync`` so
          coroutine methods can be called without ``await``;
        * every request is routed through ``handle``, which stores the body
          of each successful GET response in the module-level ``sql`` store
          under its URL;
        * ``goto(url)`` waits for network idle, stores the rendered HTML
          under the URL without its scheme, and returns an IPython ``Image``
          of a screenshot.

    Depends on the module-level names ``sync``, ``sql`` and ``unsafe``.
    The browser and the Playwright driver are never closed here.
    """
    from playwright.async_api import async_playwright

    playwright = await async_playwright().start()
    browser = await playwright.firefox.launch(headless=headless)
    page = await browser.new_page()
    # 0 disables Playwright's default operation timeout (see Playwright docs).
    page.set_default_timeout(0)

    # Replace every callable attribute with its sync() version; sync() returns
    # non-coroutine callables unchanged.
    for attr in dir(page):
        if callable(method := getattr(page, attr)):
            setattr(page, attr, sync(method))

    async def handle(route):
        """Archive the response of a successful GET, then let the request go on."""
        # Archiving is best effort: any failure here is swallowed so the page
        # keeps loading.
        with unsafe:
            if route.request.method == 'GET' and (response := await route.fetch()).ok:
                sql.put(route.request.url, await response.body())
        # The original request is still forwarded untouched, so an archived
        # GET is requested a second time on behalf of the page.
        await route.continue_()

    # page.route was wrapped above, so this call blocks until registered.
    page.route('**/*', handle)

    def goto(url, goto=page.goto):
        """Navigate to ``url``, archive the rendered HTML, return a screenshot."""
        # The default argument captures the original (sync-wrapped) page.goto
        # before it is replaced below.
        goto(url, wait_until='networkidle')
        # Strip only the leading scheme: maxsplit=1 keeps any later '://'
        # (e.g. inside a query string) and [-1] tolerates a scheme-less url.
        sql.put(url.split('://', 1)[-1], page.content().encode())
        from IPython.display import Image
        return Image(page.screenshot())

    page.goto = goto
    return page
|
|
# Launch the browser (headless by default) and keep one shared page whose
# methods can be called without `await`.
page = Page()

# Load the site: goto() stores the rendered HTML in the `sql` store and
# returns an IPython Image of a screenshot of the page.
page.goto('https://naver.com/')
|