Files
2024-10-13 00:17:56 -04:00
..
2024-10-13 00:17:56 -04:00

def SQL(db='db.sqlite'):
    """Open the SQLite database *db* and return a small query helper.

    The connection is opened with ``isolation_level=None`` (the sqlite3
    module's autocommit mode), switched to WAL journaling, checkpointed, and
    given a very long busy timeout. A ``cache(url PRIMARY KEY, blob)`` table
    is created if it does not exist yet.

    Returns:
        A callable ``query(q, *p)`` that executes statement ``q`` with the
        positional parameters ``p`` and returns all result rows as a list.
    """
    import sqlite3

    execute = sqlite3.connect(db, isolation_level=None).execute
    execute('PRAGMA journal_mode=WAL')
    execute('PRAGMA wal_checkpoint(FULL)')
    # busy_timeout is an integer number of milliseconds; format it as an int
    # instead of relying on SQLite truncating the float text "1000000000.0".
    execute(f'PRAGMA busy_timeout={10**9}')
    execute('CREATE TABLE IF NOT EXISTS cache(url PRIMARY KEY, blob)')

    def query(q, *p):
        # Materialise the cursor so callers get plain rows, not a live cursor.
        return list(execute(q, p))

    return query
# Module-wide query helper bound to 'snurad.sqlite'; the route handler
# inside Context() uses it to store response bodies in the `cache` table.
sql = SQL('snurad.sqlite')

# Reusable context manager: any Exception raised inside `with unsafe:` is
# silently discarded (used for best-effort calls).
unsafe = __import__('contextlib').suppress(Exception)

# Install red(), green(), yellow(), blue(), magenta() and cyan() into the
# module namespace. Each wraps `text` in an ANSI colour escape (codes 91-96)
# followed by a reset; the code is bound as a default so every helper keeps
# its own colour.
globals().update(
    (name, lambda text, ansi=code: f'\x1b[{ansi}m{text}\x1b[0m')
    for code, name in enumerate(
        ('red', 'green', 'yellow', 'blue', 'magenta', 'cyan'), start=91))

def sync(func):
    """Return a blocking wrapper around *func*.

    The wrapper calls ``func(*args, **kwargs)``. If the result is awaitable
    (for example the coroutine produced by an ``async def``), it is driven to
    completion with ``asyncio.run`` and its value is returned. Any other
    result is returned unchanged, so wrapping an ordinary synchronous
    callable is harmless instead of raising after the call has already run.

    ``nest_asyncio.apply()`` is invoked at decoration time; it patches
    asyncio so that ``asyncio.run`` can be used while an event loop is
    already running (e.g. inside a notebook).
    """
    __import__('nest_asyncio').apply()
    import asyncio
    import functools
    import inspect

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        if inspect.isawaitable(result):
            return asyncio.run(result)
        return result

    return wrapper
@sync
async def Context():
    """Start Playwright with Firefox and return a customised browser context.

    Because of the ``@sync`` decorator, ``Context()`` is a blocking call that
    returns the context object directly. Two things are customised on it:

    * ``context.new_page()`` is replaced by a blocking version. On each page
      it creates, every public coroutine method is replaced by a blocking
      wrapper, and ``page.goto(url)`` is replaced by a helper that navigates,
      makes a best-effort attempt to display a screenshot, and returns the
      page content parsed by BeautifulSoup with the ``lxml`` parser.
    * Every request (pattern ``**/*``) goes through a handler that fetches
      the response, stores the body of each response whose ``ok`` flag is set
      in the ``cache`` table via the module-level ``sql`` helper, prints the
      URL in red, and then fulfils the request with that response.

    The Playwright driver and the browser are not closed by this function.
    """
    import inspect
    from playwright.async_api import async_playwright

    playwright = await async_playwright().start()
    browser = await playwright.firefox.launch()
    context = await browser.new_context()
    # Playwright documents a timeout of 0 as "no timeout".
    context.set_default_timeout(0)

    @sync
    async def new_page(new_page=context.new_page):
        # The default argument captures the original coroutine method before
        # context.new_page is overwritten below.
        page = await new_page()
        for attr in dir(page):
            if attr[0] == '_':
                continue
            method = getattr(page, attr)
            # Only coroutine methods need a blocking wrapper; wrapping plain
            # synchronous methods would hand their non-awaitable result to
            # asyncio.run and break them.
            if inspect.iscoroutinefunction(method):
                setattr(page, attr, sync(method))

        def goto(url, goto=page.goto):
            # `goto` defaults to the already-blocking page.goto set above.
            goto(url)
            # `display` is presumably the IPython/Jupyter builtin; outside
            # such a session the resulting error is swallowed by `unsafe`.
            with unsafe: display(__import__('IPython').display.Image(page.screenshot()))
            return __import__('bs4').BeautifulSoup(page.content(), 'lxml')

        page.goto = goto
        return page

    context.new_page = new_page

    async def cache_response(route):
        response = await route.fetch()
        if response.ok:
            url, blob = route.request.url, await response.body()
            sql('INSERT OR REPLACE INTO cache VALUES(?,?)', url, blob)
            print(red(url))
        await route.fulfill(response=response)

    await context.route('**/*', cache_response)
    return context
# Build the blocking browser context and open the single page used below.
context = Context()
page = context.new_page()

url = 'http://self-learning.snurad.snu.ac.kr/main/sub_page.php?p_id=2&c_id=45'
soup = page.goto(url)

# Visit every root-relative link found on the start page; each navigation
# goes through the context's route handler, which caches the responses.
for a in soup.find_all(href=True):
    # startswith() rather than [0]: find_all(href=True) also matches an empty
    # href="", and indexing an empty string would raise IndexError.
    if (href := a['href']).startswith('/'):
        page.goto('http://self-learning.snurad.snu.ac.kr' + href)
        print(green(href))