From cc38b6e59d16832fec21cfecfa2a4b2069e96428 Mon Sep 17 00:00:00 2001 From: jay817 Date: Tue, 19 Nov 2024 08:09:20 -0500 Subject: [PATCH] Update playwright/README.md --- playwright/README.md | 117 +++++++------------------------------------ 1 file changed, 17 insertions(+), 100 deletions(-) diff --git a/playwright/README.md b/playwright/README.md index bebe9b1..ed11b2f 100644 --- a/playwright/README.md +++ b/playwright/README.md @@ -1,105 +1,22 @@ ```python """ -pip install playwright bs4 +WSL Debian + +pip install playwright nest_asyncio playwright install --with-deps """ # %% -from playwright.async_api import async_playwright -from bs4 import BeautifulSoup -from textwrap import wrap -import re - -async def summarize_page(page): - content = await page.content() - soup = BeautifulSoup(content, 'html.parser') - - print(f"\n{'=' * 50}\n{soup.title.string or 'No title'}\n{'=' * 50}\n") - - main_content = soup.body - if main_content: - texts = main_content.find_all(string=True) - - def clean_text(text): - text = re.sub(r'\s+', ' ', text) - text = re.sub(r'\.{2,}', '.', text) - return text.strip() - - visible_texts = [clean_text(t) for t in texts - if t.parent.name not in ['style', 'script', 'head', 'title', 'meta', '[document]']] - visible_texts = [t for t in visible_texts if t] - - if visible_texts: - print(f"{visible_texts.pop(0)}\n") - - summary = ' '.join(visible_texts) - summary = re.sub(r'\s*\.\s*', '. ', summary) - summary = ' '.join(summary.split()[:100]) - print('\n'.join(wrap(summary, width=80))) - - print("\n" + "-" * 50 + "\n") - - seen = set() - for selector in ['input', 'button', 'textarea', 'select']: - elements = await page.query_selector_all(selector) - for element in elements: - if await element.is_visible(): - async def get_element_info(element): - props = ['id', 'name', 'type', 'value', 'placeholder', 'aria-label', 'role'] - info = {} - for prop in props: - value = await element.get_attribute(prop) - if value: - info[prop] = value - - tag_name = await element.evaluate('el => el.tagName.toLowerCase()') - info['tag'] = tag_name - - return info - - element_info = await get_element_info(element) - tag = element_info.pop('tag', 'unknown') - - attrs = ' '.join([f'{k}="{v}"' for k, v in element_info.items()]) - element_str = f"<{tag} {attrs}>" - - if element_str not in seen: - print(element_str) - seen.add(element_str) - - print("\n" + "-" * 50 + "\n") - -async def main(): - async with async_playwright() as p: - browser = await p.firefox.launch() - page = await browser.new_page() - await page.goto("https://google.com") - - await summarize_page(page) - - await browser.close() - -# %% -await main() -``` - -## Output -```html -================================================== -Google -================================================== - -Google 정보 - -스토어 Gmail 이미지 로그인 무엇에 관한 의견인지 선택하세요. 더보기 삭제 삭제 부적절한 예상 검색어 신고 Google 지원 언어: -English 대한민국 광고 비즈니스 검색의 원리 개인정보처리방침 약관 설정 검색 설정 고급검색 Google 검색에 표시되는 데이터 검색 기록 -검색 도움말 의견 보내기 어두운 테마: 사용 안함 Google 앱 - --------------------------------------------------- - - - -