Files
wiki/playwright/README.md
2024-09-24 07:57:45 -04:00

3.5 KiB
Raw Blame History

"""
pip install playwright bs4
playwright install --with-deps
"""
# %%
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from textwrap import wrap
import re

# Parents whose text nodes are never rendered as visible page text.
_HIDDEN_PARENTS = frozenset({'style', 'script', 'head', 'title', 'meta', '[document]'})

# Attributes reported for each interactive element, in output order.
_ELEMENT_ATTRS = ('id', 'name', 'type', 'value', 'placeholder', 'aria-label', 'role')

# Tags treated as interactive; elements are listed grouped by tag in this order.
_INTERACTIVE_SELECTORS = ('input', 'button', 'textarea', 'select')


def _clean_text(text):
    """Collapse whitespace runs and repeated dots in *text*, then strip it."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\.{2,}', '.', text)
    return text.strip()


async def _get_element_info(element):
    """Return a dict of the element's non-empty attributes plus its tag name.

    Only the attributes in ``_ELEMENT_ATTRS`` are queried; missing or empty
    ones are omitted. The lower-cased tag name is stored under ``'tag'``.
    """
    info = {}
    for prop in _ELEMENT_ATTRS:
        value = await element.get_attribute(prop)
        if value:
            info[prop] = value

    info['tag'] = await element.evaluate('el => el.tagName.toLowerCase()')
    return info


async def summarize_page(page):
    """Print a plain-text summary of *page* to stdout.

    The summary has three parts:

    1. The document title (or ``'No title'`` when the page has no usable
       ``<title>``), framed by ``=`` rules.
    2. The first visible text fragment of ``<body>``, followed by the first
       100 whitespace-separated words of the remaining visible text, wrapped
       at 80 columns.
    3. One line per distinct visible ``input``/``button``/``textarea``/
       ``select`` element, rendered as a pseudo-tag with its non-empty
       attributes.

    Args:
        page: An object providing awaitable ``content()`` and
            ``query_selector_all(selector)`` methods (the file uses a
            Playwright async page).

    Returns:
        None. All results are written with ``print``.
    """
    content = await page.content()
    soup = BeautifulSoup(content, 'html.parser')

    # ``soup.title`` is None when the document has no <title>; guard it so the
    # 'No title' fallback is reachable instead of raising AttributeError.
    title = soup.title.string if soup.title else None
    print(f"\n{'=' * 50}\n{title or 'No title'}\n{'=' * 50}\n")

    main_content = soup.body
    if main_content:
        texts = main_content.find_all(string=True)

        visible_texts = [
            _clean_text(t) for t in texts
            if t.parent.name not in _HIDDEN_PARENTS
        ]
        # Drop fragments that were whitespace-only before cleaning.
        visible_texts = [t for t in visible_texts if t]

        if visible_texts:
            # The first fragment is printed on its own line as a heading.
            print(f"{visible_texts.pop(0)}\n")

        summary = ' '.join(visible_texts)
        # Normalise spacing around full stops before truncating.
        summary = re.sub(r'\s*\.\s*', '. ', summary)
        summary = ' '.join(summary.split()[:100])
        print('\n'.join(wrap(summary, width=80)))

    print("\n" + "-" * 50 + "\n")

    # Identical renderings are printed once only.
    seen = set()
    for selector in _INTERACTIVE_SELECTORS:
        elements = await page.query_selector_all(selector)
        for element in elements:
            if not await element.is_visible():
                continue

            element_info = await _get_element_info(element)
            tag = element_info.pop('tag', 'unknown')

            attrs = ' '.join([f'{k}="{v}"' for k, v in element_info.items()])
            element_str = f"<{tag} {attrs}>"

            if element_str not in seen:
                print(element_str)
                seen.add(element_str)

    print("\n" + "-" * 50 + "\n")

async def main(url="https://google.com"):
    """Open *url* in Firefox via Playwright and print a summary of the page.

    Args:
        url: Address to navigate to. Defaults to ``"https://google.com"``.

    Returns:
        None. The summary is printed by ``summarize_page``.
    """
    async with async_playwright() as p:
        browser = await p.firefox.launch()
        try:
            page = await browser.new_page()
            await page.goto(url)

            await summarize_page(page)
        finally:
            # Close the browser even if navigation or summarising fails.
            await browser.close()

# %%
# Top-level ``await`` only works where an event loop is already running, such
# as the notebook-style cell marked by ``# %%`` above. In a plain script, use
# ``asyncio.run(main())`` instead.
await main()

Output

==================================================
Google
==================================================

Google 정보

스토어 Gmail 이미지 로그인 무엇에 관한 의견인지 선택하세요. 더보기 삭제 삭제 부적절한 예상 검색어 신고 Google 지원 언어:
English 대한민국 광고 비즈니스 검색의 원리 개인정보처리방침 약관 설정 검색 설정 고급검색 Google 검색에 표시되는 데이터 검색 기록
검색 도움말 의견 보내기 어두운 테마: 사용 안함 Google 앱

--------------------------------------------------

<input name="btnK" type="submit" value="Google 검색" aria-label="Google 검색" role="button">
<input name="btnI" type="submit" value="I'm Feeling Lucky" aria-label="I'm Feeling Lucky">
<textarea id="APjFqb" name="q" aria-label="검색" role="combobox">

--------------------------------------------------