def SQL(db='db.sqlite'):
    """Open (or create) the SQLite file *db* and return a query helper.

    The connection is opened with ``isolation_level=None`` and then set up
    with a fixed sequence of statements: WAL journal mode, a full WAL
    checkpoint, a busy timeout of 1e9, and creation of the
    ``cache(url PRIMARY KEY, blob)`` table when it does not exist yet.

    Returns:
        A callable ``query(q, *p)`` that executes statement ``q`` with the
        positional parameters ``p`` and returns every result row in a list.
    """
    import sqlite3

    execute = sqlite3.connect(db, isolation_level=None).execute

    setup_statements = (
        'PRAGMA journal_mode=WAL',
        'PRAGMA wal_checkpoint(FULL)',
        # The float formats as "1000000000.0", exactly as the statement
        # has always been sent.
        f'PRAGMA busy_timeout={1e9}',
        'CREATE TABLE IF NOT EXISTS cache(url PRIMARY KEY, blob)',
    )
    for statement in setup_statements:
        execute(statement)

    def query(q, *p):
        # Materialise the cursor so callers get a plain list of row tuples.
        return list(execute(q, p))

    return query
# A single shared context manager that swallows any Exception raised inside
# its ``with`` block.
unsafe = __import__('contextlib').suppress(Exception)
# Query helper bound to this script's cache database file.
sql = SQL(db='snurad.sqlite')
# Install module-level helpers red(), green(), yellow(), blue(), magenta()
# and cyan(). Each wraps ``text`` in an ANSI colour escape (codes 91-96, in
# that order) followed by a reset. The code is bound as a default argument so
# every helper keeps its own value.
globals().update(
    (name, lambda text, ansi=code: f'\x1b[{ansi}m{text}\x1b[0m')
    for code, name in enumerate(
        'red green yellow blue magenta cyan'.split(), start=91)
)
def sync(func):
    """Return a blocking wrapper around *func*.

    Calling the wrapper calls ``func(*args, **kwargs)``. If the result is
    awaitable (a coroutine, future, or object with ``__await__``) it is
    driven to completion with ``asyncio.run`` and its value is returned.
    Any other result is returned unchanged, so the wrapper is also safe
    around ordinary, non-async callables.

    Args:
        func: Any callable, typically an ``async def`` function or a bound
            coroutine method.

    Returns:
        A function with the metadata of *func* (via ``functools.wraps``)
        that can be called from synchronous code.
    """
    import asyncio
    import functools
    import inspect

    # nest_asyncio patches asyncio so that asyncio.run may be used while an
    # event loop is already running (per its documentation).
    __import__('nest_asyncio').apply()

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        # Previously every result went straight to asyncio.run, which
        # rejects non-awaitables; wrapping a plain method therefore raised
        # instead of returning its value.
        if inspect.isawaitable(result):
            return asyncio.run(result)
        return result

    return wrapper
@sync
async def Context():
    """Start Playwright, launch Firefox and return a prepared browser context.

    The returned context is customised in three ways:

    * its default timeout is set to 0;
    * ``context.new_page`` is replaced by a synchronous version whose pages
      have every public callable attribute wrapped with ``sync``, and whose
      ``goto(url)`` navigates, tries to display a screenshot (any failure is
      suppressed) and returns the page content parsed by BeautifulSoup with
      the ``lxml`` parser;
    * a catch-all route fetches every request, stores the body of each ``ok``
      response in the ``cache`` table keyed by URL, prints that URL in red,
      and then fulfils the request with the fetched response.
    """
    from playwright.async_api import async_playwright

    driver = await async_playwright().start()
    firefox = await driver.firefox.launch()
    context = await firefox.new_context()
    context.set_default_timeout(0)

    @sync
    async def new_page(new_page=context.new_page):
        # The default argument captures the original context.new_page before
        # it is replaced below.
        page = await new_page()

        # Make every public callable on the page usable without ``await``.
        for name in dir(page):
            if name.startswith('_'):
                continue
            member = getattr(page, name)
            if callable(member):
                setattr(page, name, sync(member))

        def goto(url, goto=page.goto):
            # ``goto`` defaults to the already sync-wrapped page.goto.
            goto(url)
            # Best effort: ``display`` is looked up first, so when it is not
            # defined the error is suppressed before a screenshot is taken.
            with unsafe:
                display(__import__('IPython').display.Image(page.screenshot()))
            return __import__('bs4').BeautifulSoup(page.content(), 'lxml')

        page.goto = goto
        return page

    context.new_page = new_page

    async def cache_response(route):
        response = await route.fetch()
        if response.ok:
            url = route.request.url
            blob = await response.body()
            sql('INSERT OR REPLACE INTO cache VALUES(?,?)', url, blob)
            print(red(url))
        # The request is always answered with what was fetched, cached or not.
        await route.fulfill(response=response)

    await context.route('**/*', cache_response)
    return context
# Crawl one level deep: load the start page, then visit every link on it
# whose href is a root-relative path on the same host.
context = Context()
page = context.new_page()
origin = 'http://self-learning.snurad.snu.ac.kr'
url = f'{origin}/main/sub_page.php?p_id=2&c_id=45'
soup = page.goto(url)
for a in soup.find_all(href=True):
    href = a['href']
    # find_all(href=True) also yields tags with an empty href; startswith()
    # skips those, whereas indexing href[0] raised IndexError.
    if href.startswith('/'):
        page.goto(origin + href)
        print(green(href))