본문 바로가기
인공지능/PYTHON

크롤링 (2)

by bibibig_data 2021. 6. 23.

1. 프렌즈 드라마 분석

# Process the drama transcript  (Do it! Python life programming, 1st ed. 2020: p.52)
# Download friends101.txt into the same folder before running.
import os, re, codecs

# Change this to your own save directory.
os.chdir(r'D:/pythonclass')

# Read the whole episode script; 'with' closes the file even on error.
# (The original opened f here and rebound it below without ever closing
# the first handle — a resource leak.)
with open('friends101.txt', 'r', encoding='utf-8') as f:
    script101 = f.read()

# String-object slicing: peek at the first 100 characters.
print(script101[:100])

# Collect every line spoken by Monica (the original ran this findall twice).
Line = re.findall(r'Monica:.+', script101)
# Print only the first three elements of the list.
print(Line[:3])

# Write monica.txt — build the text in one pass instead of quadratic '+='.
with open('monica.txt', 'w', encoding='utf-8') as f:
    f.write(''.join(line + '\n' for line in Line))

# Preview Monica's dialogue: first three lines only.
for spoken in Line[:3]:
    print(spoken)

# Gather the character names ("Name: " prefixes) from the script.
names = re.findall(r'[A-Z][a-z]+: ', script101)
print(names)

# Drop the duplicates with a set.
print(set(names))

# Strip the trailing ": " from each unique name, all on one line.
character = [name[:-2] for name in set(names)]
character  # shows the list only in a REPL/notebook

# Print each character name on its own line.
for name in character:
    print(name)

# Stage directions only: text wrapped in parentheses.
# FIX: inside [...] the pipe is a literal character, not alternation,
# so the original [a-z|\.] also matched a stray '|'; [a-z.] is intended.
re.findall(r'\([A-Za-z].+[a-z.]\)', script101, re.VERBOSE)[:6]

# Re-read the script line by line. Explicit utf-8: the platform default
# (e.g. cp949 on Korean Windows) can fail on this file.
with open('friends101.txt', 'r', encoding='utf-8') as f:
    sentences = f.readlines()

# Experiment with the first 20 sentences:
# re.match anchors the pattern at the start of each line.
for line in sentences[:20]:
    if re.match(r'[A-Z][a-z]+:', line):
        print(line)

# Dialogue lines containing "would".
would = [s for s in sentences
         if re.match(r'[A-Z][a-z]+:', s) and re.search('would', s)]
# Dialogue lines containing " take".
take = [s for s in sentences
        if re.match(r'[A-Z][a-z]+:', s) and re.search(' take', s)]
# Print the "take" sentences.
for s in take:
    print(s)

# Save the "would" sentences; 'with' guarantees the buffer is flushed and
# the handle closed (the original never closed newf).
with open('would.txt', 'w', encoding='utf-8') as newf:
    newf.writelines(would)

 


 

2. 크롤링 실습 2

http://quotes.toscrape.com/

import requests
from bs4 import BeautifulSoup
from pprint import pprint as pp

# Fetch the scraping-practice site and inspect the raw response.
# NOTE: bare expressions below show output only in a REPL/notebook.
resp = requests.get('http://quotes.toscrape.com/')
resp.headers['Server']
pp(resp.text)

soup = BeautifulSoup(resp.text, 'html.parser')  # choose an appropriate parser

# Inspect the BeautifulSoup object interactively.
type(soup)
help(soup)
dir(soup)

#
# Navigating the tree
#
# NOTE: the bare expressions below display output only in a REPL/notebook;
# run as a script they are no-ops.

soup
soup.name

type(soup.title)
help(soup.title)
dir(soup.title)

soup.title
soup.title.string
soup.title.text

type(soup.title.string)
help(soup.title.string)
dir(soup.title.string)
type(soup.title.text)

type(soup.title.parent.name)
soup.title.parent.name

soup.div              # first div tag encountered
soup.div['class']     # value of its class attribute
soup.div.span.string  # NavigableString type
soup.div.span.text    # str type

soup.span
soup.span.text

soup.find('div')
soup.find('div').find('span')
soup.find('div').find('span').text
# Equivalent CSS-selector forms:
# soup.select_one('div')
# soup.select_one('div > span')
# soup.select_one('div > span').text

soup.find('span')
soup.find('span').text
# soup.select_one('span')
# soup.select_one('span').text

# Every quote card on the page.
soup.find_all('div', class_='quote')
# soup.select('div.quote')

ret = soup.find_all('div', class_='quote')

# Exploration of the result set, kept for reference:
# type(ret)
# len(ret)
# 
# ret[0]
# 
# for i in range(len(ret)):
#     pp(ret[i])
# 
# ret[0].find_all('span')
# ret[0].find_all('span')[0]
# ret[0].find_all('span')[1]
# ret[0].find_all('span')[1].find('small')
# ret[0].find_all('span')[1].find('small').text
# 
# ret[0].find_all('a')
# ret[0].find_all('a')[0]
# ret[0].find_all('a')[0]['href']
# 
# for i in range(len(ret)):
#     pp(ret[i].find_all('span')[0].text)
#     pp(ret[i].find_all('span')[1].find('small').text)
#     pp('http://quotes.toscrape.com/' + ret[i].find_all('a')[0]['href'])

# Build [quote, author, link] rows — iterate the quote cards directly
# instead of indexing with range(len(ret)), and call find_all('span')
# once per card instead of twice.
list_ = []
for card in ret:
    spans = card.find_all('span')               # [0] quote text, [1] author wrapper
    quote_ = spans[0].text
    by_ = spans[1].find('small').text
    link_ = 'http://quotes.toscrape.com/' + card.find_all('a')[0]['href']
    list_.append([quote_, by_, link_])

list_

 


 

3. 크롤링 실습 3

https://book.coalastudy.com/data-crawling/week-1

 

https://www.crummy.com/software/BeautifulSoup/bs4/doc/

 

크롤링 할 자료

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>Coala Datascrap W1</title>
    <style>
        .main-title {
            font-size: 30px;
            font-weight: 600;
        }
        .items-row {
            width: 1100px;
            margin-top: 30px;
            clear: left;
            overflow: hidden;
        }
        img {
            border: 2px solid rgb(236, 236, 236);
        }
        .item {
            float: left;
            margin-right: 20px;
        }
        .metadata {
            padding-left: 5px;
        }
        .price {
            margin: 8px 0;
            color:rgb(226, 43, 89);
            font-weight: 600;
        }
        span {
            display: block;
        }
        a {
            text-decoration: none;
        }
        a > span {
            color: white;
            background-color: grey;
            display: inline-block;
            margin: 5px 0 0 5px;
            padding: 5px;
            font-size: 14px;
        }
    </style>
</head>
<body>
    <span class="main-title">제품 리스트</span>
    <div id="items-section">
        <div class="items-row">
            <div class="item">
                <img src="./notebook1.png">
                <div class="metadata">
                    <span class="title">14U380-EU1TK</span>
                    <span class="price">최저 389,000원</span>
                    <span class="comments">상품평 599</span>
                </div>
                <a href="#">
                    <span>판매처 72</span>
                </a>
            </div>
            <div class="item">
                <img src="./notebook2.png">                
                <div class="metadata">
                    <span class="title">14ZD980-GX30K</span>
                    <span class="price">최저 1,048,990원</span>
                    <span class="comments">상품평 419</span>
                </div>
                <a href="#">
                    <span>판매처 175</span>
                </a>
            </div>
            <div class="item">
                <img src="./notebook3.png">                
                <div class="metadata">
                    <span class="title">15ZD980-GX50K</span>
                    <span class="price">최저 1,383,840원</span>
                    <span class="comments">상품평 3,295</span>
                </div>
                <a href="#">
                    <span>판매처 300</span>
                </a>
            </div>
            <div class="item">
                <img src="./notebook4.png">                
                <div class="metadata">
                    <span class="title">10T370-L860K</span>
                    <span class="price">최저 289,000원</span>
                    <span class="comments">상품평 1,347</span>
                </div>
                <a href="#">
                    <span>판매처 50</span>
                </a>
            </div>
            <div class="item">
                <img src="./notebook5.png">                
                <div class="metadata">
                    <span class="title">NT950QAA-X58A</span>
                    <span class="price">최저 1,649,000원</span>
                    <span class="comments">상품평 673</span>
                </div>
                <a href="#">
                    <span>판매처 114</span>
                </a>
            </div>
        </div>
        <div class="items-row">
            <div class="item">
                <img src="./notebook6.png">                
                <div class="metadata">
                    <span class="title">14ZD980-GX50K</span>
                    <span class="price">최저 1,269,000원</span>
                    <span class="comments">상품평 1,797</span>
                </div>
                <a href="#">
                    <span>판매처 116</span>
                </a>
            </div>
            <div class="item">
                <img src="./notebook7.png">                
                <div class="metadata">
                    <span class="title">NT500R3W-KD1S</span>
                    <span class="price">최저 439,000원</span>
                    <span class="comments">상품평 820</span>
                </div>
                <a href="#">
                    <span>판매처 82</span>
                </a>
            </div>
            <div class="item">
                <img src="./notebook8.png">                
                <div class="metadata">
                    <span class="title">NT500R5W-LD31A</span>
                    <span class="price">최저 628,000원</span>
                    <span>상품평 1,065</span>
                </div>
                <a href="#">
                    <span class="sellers">판매처 132</span>
                </a>
            </div>
            <div class="item">
                <img src="./notebook9.png">                
                <div class="metadata">
                    <span class="title">D125</span>
                    <span class="price">최저 299,000원</span>
                    <span>상품평 345</span>
                </div>
                <a href="#">
                    <span class="sellers">판매처 9</span>
                </a>
            </div>
            <div class="item">
                <img src="./notebook10.png">                
                <div class="metadata">
                    <span class="title">15U480-LR1WK</span>
                    <span class="price">최저 639,000원</span>
                    <span class="comments">상품평 378</span>
                </div>
                <a href="#">
                    <span>판매처 62</span>
                </a>
            </div>
        </div>
    </div>
</body>
</html>

 

문제

from bs4 import BeautifulSoup

# Read and parse the local shopping page. 'with' closes the file handle
# (the original passed an open handle to BeautifulSoup and never closed it).
with open(r"C:\dev\atomworkspace\shopping\shopping.html", 'r', encoding="utf-8") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

# 1. Selector for the product-name elements.
soup.select('.title')

# 2. Selector for the product-image elements.
soup.select('img')

# 3. Path through the item element's children down to the review count.
soup.select('div.metadata > span.comments')

# 4. Descendant selector that reaches name, price, review count, sellers
#    under the whole product-list section, one span at a time.
soup.select("#items-section span")

# 물품 이미지에 해당하는 HTML 요소
# 물품 정보 HTML 요소의 자식들 중 상품평에 해당하는 HTML요소
# 전체 물품 리스트 HTML 요소의 자손들 중 품명, 가격, 상품평 등의 데이터를 가져오는 선택자 경로

 

 


 

예제 5 ) 

How to add CSS (w3schools.com)

참고자료

# https://www.w3schools.com/cssref/css_selectors.asp

from bs4 import BeautifulSoup

# Sample document for practicing CSS selectors.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<a href="http://www.naver.com" class="portal" id="naver">Elsie</a>,
  
<p class="story">Once upon a time there were three little sisters; and their names were
  <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
  <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
  <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
  and they lived at the bottom of a well.
</p>

<p class="story sister">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Bare expressions: results display only in a REPL/notebook.
soup.select('.title')         # class selector
soup.select('.story.sister')  # elements carrying BOTH classes
soup.select('.story .sister') # .sister descendants of .story
soup.select('#link2')         # id selector
soup.select('*')              # every element
soup.select('p')              # tag selector
soup.select('p.story')        # tag + class
soup.select('title, a')       # grouping: either selector
soup.select('body a')         # descendant combinator
soup.select('body > a')       # direct-child combinator


# Second sample: sibling combinators, attribute selectors, pseudo-classes.
html_doc = """
<!DOCTYPE html>
<html>
<head>
</head>
<body>

<h1>Welcome to My Homepage</h1>

<div>
  <h2>My name is Donald</h2>
  <p>I live in Duckburg.</p>
</div>

<p nemam='stmt'>My best friend is Mickey.</p>
<p nemam='stmt'>I will not be styled.</p>

<a href="https://www.w3schools.com" target="_blank">w3schools.com</a>

<div>
  <p lang="en">Hello!</p>
  <p lang="enus">Hi!</p>
  <p lang="en-gb">Ello!</p>
  <p lang="us">Hi!</p>
  <p lang="no">Hei!</p>
</div>
 
<p>비켜주세요</p>
  
</body>
</html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

soup.select("div + p")             # p immediately after a div (adjacent sibling)
soup.select("div ~ p")             # every p sibling that follows a div
soup.select('a[target]')           # a elements that have a target attribute
soup.select('a[target="_blank"]')  # ... with exactly that value
soup.select('[nemam~=stmt]')       # attribute value contains the word "stmt"
soup.select('[lang|=en]')          # value is "en" or starts with "en-"
soup.select('[lang^=en]')          # value starts with "en"
soup.select('[lang$=us]')          # value ends with "us"
soup.select('[lang*=us]')          # value contains "us"

soup.select('p:nth-of-type(2)')    # second p among its siblings

 


---------------------------------
> 네이버 실시간 검색어 크롤링   <
---------------------------------

* 다음 실시간 검색어 서비스는 2020년 2월 서비스 중지됨

* 변경된 네이버 실검 방식에 맞는 크롤링
  - 네이버 JSON 주소 찾기 : https://wikidocs.net/68005

 

import requests
from bs4 import BeautifulSoup

# Fetch Naver's realtime-search ranks as JSON.
# Local renamed from `json`: that name shadows the stdlib json module.
rank_json = requests.get('https://www.naver.com/srchrank?frm=main').json()
rank_json

# Extract the "data" entry (dict.get returns None when the key is absent).
ranks = rank_json.get("data")
ranks

# Iterate the rank list; `or []` guards against a missing/None "data"
# (the original raised TypeError when the key was absent).
for r in ranks or []:
    # each item provides rank, keyword, keyword_synonyms
    rank = r.get("rank")
    keyword = r.get("keyword")
    print(rank, keyword)
    

 

 


 

---------------------
> 네이트판 크롤링   <
---------------------

 

import requests
from bs4 import BeautifulSoup

# Fetch the Nate Pann front page and parse it.
resp = requests.get('https://pann.nate.com/')
soup = BeautifulSoup(resp.text, 'html.parser')

# Full copied selector, kept for reference; the shortened tail below
# matches the same "best talk" list:
# result = soup.select("#container > div.content.main > div.post-wrap > div.bestTalkBox > div:nth-child(2) > ol")
result = soup.select('div.bestTalkBox > div:nth-child(2) > ol')

type(result)  # list-like result set; visible only in a REPL

# Dump the text of every matched list element.
for talk_list in result:
    print(talk_list.text)

생활프로그래밍 282p : py파일을 변환시키기

'인공지능 > PYTHON' 카테고리의 다른 글

셀레늄  (0) 2021.06.25
Crawling and Flask  (0) 2021.06.24
Flask 2  (0) 2021.06.22
Flask  (0) 2021.06.22
Python 300제 - 함수  (0) 2021.06.18