Files
wiki/playwright/README.md
2024-09-24 07:57:45 -04:00

105 lines
3.5 KiB
Markdown
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
```python
"""
pip install playwright bs4
playwright install --with-deps
"""
# %%
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from textwrap import wrap
import re
_RULE = "-" * 50
# Parents whose text nodes are never rendered to the user.
_HIDDEN_PARENTS = frozenset(
    ['style', 'script', 'head', 'title', 'meta', '[document]']
)
# Attributes reported for each interactive element, in output order.
_ELEMENT_PROPS = (
    'id', 'name', 'type', 'value', 'placeholder', 'aria-label', 'role',
)
_FORM_SELECTORS = ('input', 'button', 'textarea', 'select')


def _clean_text(text):
    """Collapse whitespace runs and repeated dots, then strip both ends."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\.{2,}', '.', text)
    return text.strip()


async def _get_element_info(element):
    """Return ``(tag, attrs)`` for an element handle, skipping empty attributes."""
    attrs = {}
    for prop in _ELEMENT_PROPS:
        value = await element.get_attribute(prop)
        if value:
            attrs[prop] = value
    tag = await element.evaluate('el => el.tagName.toLowerCase()')
    return tag, attrs


def _format_element(tag, attrs):
    """Render ``tag`` and ``attrs`` as an HTML-like opening tag string."""
    rendered = ' '.join(f'{k}="{v}"' for k, v in attrs.items())
    # Avoid the stray space of "<button >" when there is nothing to show.
    return f"<{tag} {rendered}>" if rendered else f"<{tag}>"


async def summarize_page(page):
    """Print a plain-text summary of a page to stdout.

    Three sections are printed:

    1. The document title between ``=`` rules (``No title`` when the page
       has no usable ``<title>``).
    2. Text found under ``<body>``: the first non-empty fragment on its own
       line, followed by at most 100 words of the remaining text wrapped
       at 80 columns. HTML comments and text inside the tags listed in
       ``_HIDDEN_PARENTS`` are excluded.
    3. A de-duplicated list of the visible ``input``, ``button``,
       ``textarea`` and ``select`` elements with their non-empty
       identifying attributes.

    Args:
        page: Object providing the async ``content()`` and
            ``query_selector_all()`` methods used below (in this file, a
            Playwright async page).

    Returns:
        None. All output is written with ``print``.
    """
    # Local import: only ``BeautifulSoup`` is imported at file level.
    from bs4 import Comment

    content = await page.content()
    soup = BeautifulSoup(content, 'html.parser')

    # ``soup.title`` is None for documents without a <title>; ``.string`` is
    # None when the tag does not hold exactly one string.
    title_tag = soup.title
    title = (title_tag.string if title_tag is not None else None) or 'No title'
    print(f"\n{'=' * 50}\n{title}\n{'=' * 50}\n")

    body = soup.body
    if body is not None:
        fragments = [
            _clean_text(node)
            for node in body.find_all(string=True)
            # find_all(string=True) also yields comment nodes, which are
            # not visible text.
            if not isinstance(node, Comment)
            and node.parent.name not in _HIDDEN_PARENTS
        ]
        fragments = [fragment for fragment in fragments if fragment]
        if fragments:
            # The first fragment is printed alone, as a heading.
            heading, *fragments = fragments
            print(f"{heading}\n")
        summary = ' '.join(fragments)
        summary = re.sub(r'\s*\.\s*', '. ', summary)
        summary = ' '.join(summary.split()[:100])
        print('\n'.join(wrap(summary, width=80)))

    print(f"\n{_RULE}\n")

    seen = set()
    for selector in _FORM_SELECTORS:
        for element in await page.query_selector_all(selector):
            if not await element.is_visible():
                continue
            tag, attrs = await _get_element_info(element)
            element_str = _format_element(tag, attrs)
            # Identical-looking elements are reported once.
            if element_str not in seen:
                seen.add(element_str)
                print(element_str)

    print(f"\n{_RULE}\n")
async def main(url="https://google.com"):
    """Open ``url`` in headless Firefox and print its summary.

    Args:
        url: Address to load. Defaults to the Google home page, which is
            what this function loaded before the parameter existed.

    Returns:
        None. The summary is printed by ``summarize_page``.
    """
    async with async_playwright() as p:
        browser = await p.firefox.launch()
        try:
            page = await browser.new_page()
            await page.goto(url)
            await summarize_page(page)
        finally:
            # Close the browser even when navigation or summarizing fails.
            await browser.close()
# %%
# Top-level ``await`` is only valid where an event loop is already running,
# e.g. a Jupyter/IPython cell (presumably the intent of the ``# %%`` markers).
# In a plain script, use ``asyncio.run(main())`` instead.
await main()
```
## Output
```html
==================================================
Google
==================================================
Google 정보
스토어 Gmail 이미지 로그인 무엇에 관한 의견인지 선택하세요. 더보기 삭제 삭제 부적절한 예상 검색어 신고 Google 지원 언어:
English 대한민국 광고 비즈니스 검색의 원리 개인정보처리방침 약관 설정 검색 설정 고급검색 Google 검색에 표시되는 데이터 검색 기록
검색 도움말 의견 보내기 어두운 테마: 사용 안함 Google 앱
--------------------------------------------------
<input name="btnK" type="submit" value="Google 검색" aria-label="Google 검색" role="button">
<input name="btnI" type="submit" value="I’m Feeling Lucky" aria-label="I’m Feeling Lucky">
<textarea id="APjFqb" name="q" aria-label="검색" role="combobox">
--------------------------------------------------
```