From 186580bc4badb606ae0a2f25aae4cb26e6cc5d0f Mon Sep 17 00:00:00 2001 From: jay817 Date: Tue, 24 Sep 2024 07:57:16 -0400 Subject: [PATCH] Update playwright/README.md --- playwright/README.md | 220 ++++++++++++++++--------------------------- 1 file changed, 82 insertions(+), 138 deletions(-) diff --git a/playwright/README.md b/playwright/README.md index 00ede7c..e58b3b8 100644 --- a/playwright/README.md +++ b/playwright/README.md @@ -1,157 +1,101 @@ ```python -# google.py +# %% +from playwright.async_api import async_playwright +from bs4 import BeautifulSoup +from textwrap import wrap +import re -#%% -import xvfbwrapper, playwright.async_api -from PIL import Image; from io import * -#%% -xvfbwrapper.Xvfb().start() -playwright = await playwright.async_api.async_playwright().start() -browser = await playwright.chromium.launch(headless=False, - args=['--enable-features=WebContentsForceDark']) -page = await browser.new_page() -#%% -await page.goto('https://google.com') -image = Image.open(BytesIO(await page.screenshot())) -image.save('google.png') -``` +async def summarize_page(page): + content = await page.content() + soup = BeautifulSoup(content, 'html.parser') -위의 `google.py`는 top-level await를 사용하므로 아래와 같이 실행해야 한다. -```sh -python -m asyncio < google.py -``` + print(f"\n{'=' * 50}\n{soup.title.string or 'No title'}\n{'=' * 50}\n") -```python -# play.py -from playwright.async_api import async_playwright as aP -import os, io, asyncio, xvfbwrapper -from db import DB + main_content = soup.body + if main_content: + texts = main_content.find_all(string=True) -async def Page(browser='chromium', headless=True): - if headless: xvfbwrapper.Xvfb().start() - else: os.environ['DISPLAY'] = ':0' + def clean_text(text): + text = re.sub(r'\s+', ' ', text) + text = re.sub(r'\.{2,}', '.', text) + return text.strip() - db = DB() - playwright = await aP().start() - browser = await getattr(playwright, browser).launch(headless=False) - context = await browser.new_context(accept_downloads=True) - context.set_default_timeout(0) + visible_texts = [clean_text(t) for t in texts + if t.parent.name not in ['style', 'script', 'head', 'title', 'meta', '[document]']] + visible_texts = [t for t in visible_texts if t] - async def save(response): - try: - if response.ok and not db.exists(url := response.url): - db[url] = await response.body() - except: pass - - async def load(route): - if body := db[route.request.url]: return await route.fulfill(body=body) - await route.continue_() + if visible_texts: + print(f"{visible_texts.pop(0)}\n") - context.on('response', save) - await context.route('**/*', load) - for block in ['**/*.gif', '**/css*.js']: - await context.route(block, lambda route: route.abort()) - - return await context.new_page() + summary = ' '.join(visible_texts) + summary = re.sub(r'\s*\.\s*', '. ', summary) + summary = ' '.join(summary.split()[:100]) + print('\n'.join(wrap(summary, width=80))) + + print("\n" + "-" * 50 + "\n") + + seen = set() + for selector in ['input', 'button', 'textarea', 'select']: + elements = await page.query_selector_all(selector) + for element in elements: + if await element.is_visible(): + async def get_element_info(element): + props = ['id', 'name', 'type', 'value', 'placeholder', 'aria-label', 'role'] + info = {} + for prop in props: + value = await element.get_attribute(prop) + if value: + info[prop] = value + + tag_name = await element.evaluate('el => el.tagName.toLowerCase()') + info['tag'] = tag_name + + return info + + element_info = await get_element_info(element) + tag = element_info.pop('tag', 'unknown') + + attrs = ' '.join([f'{k}="{v}"' for k, v in element_info.items()]) + element_str = f"<{tag} {attrs}>" + + if element_str not in seen: + print(element_str) + seen.add(element_str) + + print("\n" + "-" * 50 + "\n") async def main(): - page = await Page() - await page.goto('https://reddit.com') - await page.screenshot(path='screenshot.png') + async with async_playwright() as p: + browser = await p.firefox.launch() + page = await browser.new_page() + await page.goto("https://google.com") -if __name__ == "__main__": - asyncio.run(main()) + await summarize_page(page) + + await browser.close() + +# %% +await main() ``` -```python -# db.py -import sqlite3, json, os +## Output +```html +================================================== +Google +================================================== -class DB(sqlite3.Connection): - def __init__(self, db_name=".db.sqlite"): - super().__init__(os.path.expanduser(db_name)) - with self: - self.execute(''' - CREATE TABLE IF NOT EXISTS kv_store - (key TEXT PRIMARY KEY, value BLOB) - ''') +Google 정보 - def __setitem__(self, key, value): - value = value if isinstance(value, bytes) else json.dumps(value) - with self: - result = self.execute(''' - INSERT OR REPLACE INTO kv_store - (key, value) VALUES (?, ?) - ''', (key, value)).rowcount - return {"modified_count": result} +스토어 Gmail 이미지 로그인 무엇에 관한 의견인지 선택하세요. 더보기 삭제 삭제 부적절한 예상 검색어 신고 Google 지원 언어: +English 대한민국 광고 비즈니스 검색의 원리 개인정보처리방침 약관 설정 검색 설정 고급검색 Google 검색에 표시되는 데이터 검색 기록 +검색 도움말 의견 보내기 어두운 테마: 사용 안함 Google 앱 - def __getitem__(self, key): - with self: - result = self.execute(''' - SELECT value FROM kv_store - WHERE key = ? - ''', (key,)).fetchone() - if result: - if isinstance(value := result[0], str): - try: return json.loads(value) - except json.JSONDecodeError: pass - return value +-------------------------------------------------- - def delete(self, key): - with self: - result = self.execute(''' - DELETE FROM kv_store - WHERE key = ? - ''', (key,)).rowcount - return {"deleted_count": result} + + +