Tuesday, June 26, 2018

Python BeautifulSoup


Public XML resources

https://www.data.go.kr/
http://www.weather.go.kr/weather/lifenindustry/sevice_rss.jsp


# For XML parsing, use BeautifulSoup.
# Requires installation: pip install beautifulsoup4
from bs4 import BeautifulSoup

fp = open("song.xml")
soup = BeautifulSoup(fp, "html.parser")
#print(type(soup), soup)
# soup represents the root element of the XML document.
# Descendants can be reached with ".", even if they are not direct children; .find("descendant_name") works the same way.
print(soup.songlist.song)  # OK
print(soup.song)           # OK
sng = soup.song
print(sng.title)
print(sng.title.text)

sngs = soup.findAll('song')  # a bs4.element.ResultSet: a list-like collection of Tag objects
print(type(sngs))
print(sngs[1].title.string)

for s in sngs:
    print(s['album'])

print(sngs[1].parent)

# Sibling-style navigation: findPrevious, findNext find the previous/next matching element at the same level, even if it belongs to a different parent.
print(sngs[1].findPrevious('song'))
print(sngs[1].findNext('song'))

soup.find('song', {'album':'BB'}) # search by attribute
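
For reference, the same attribute search can also be written with a keyword argument or a CSS selector (a minimal sketch; 'BB' is just the album value used above):

soup.find('song', album='BB')        # keyword-argument form
soup.select('song[album="BB"]')      # CSS attribute selector, returns a list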

class A:
    def __init__(self):
        self.a = 0
a = A()
print(a)   # prints the default representation, e.g. <__main__.A object at 0x...>
# How can print(soup) produce readable text when soup is an object?
# Because BeautifulSoup implements the __repr__ method.
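
A minimal sketch of the same idea (hypothetical class B, not from the original notes):

class B:
    def __init__(self):
        self.a = 0
    def __repr__(self):
        # print() falls back to __repr__, so this text is shown instead of <__main__.B object at 0x...>
        return "<B a=%d>" % self.a

b = B()
print(b)  # <B a=0>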



RSS reader

from bs4 import BeautifulSoup
import urllib.request as REQ

jurl = "http://rss.joins.com/joins_news_list.xml"
response = REQ.urlopen(jurl)

soup = BeautifulSoup(response, "html.parser")
#print(soup)
items = soup.findAll("item")

for i in items:
    print("기사제목:", i.title.string )
    print("기사내용:", i.description.string)
    print("-"*20)


Extracting parts of an HTML page

from bs4 import BeautifulSoup
import urllib.request as REQ

nurl = "http://news.naver.com"response = REQ.urlopen(nurl)

soup = BeautifulSoup(response, "html.parser")
#print(soup)
items = soup.findAll("div", {'class':'newsnow_tx_inner'})
for i in items:
    print(i.strong.string)


Adding elements
from bs4 import BeautifulSoup

fp = open("song.xml")
soup = BeautifulSoup(fp, "html.parser")

print(soup)

# How to add a new element
n_song   = soup.new_tag('song', album='Cheap Thrills')
n_title  = soup.new_tag('title')
n_title.string = 'Chandelier'
n_singer = soup.new_tag('singer')
n_singer.string = 'Sia'
n_song.append(n_title)
n_song.append(n_singer)

soup.songlist.append(n_song)
print (soup)

# To save the changes, the soup must be converted to a string first.
s = soup.prettify()  # s is a string, not a soup object
print(s)

f = open("song.xml", "w")
f.write(s)
f.close()
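
If song.xml contains non-ASCII text (e.g., Korean titles), it is safer to pass an explicit encoding when reading and writing; a minimal sketch assuming the file is UTF-8:

fp = open("song.xml", encoding="utf-8")
soup = BeautifulSoup(fp, "html.parser")
f = open("song.xml", "w", encoding="utf-8")
f.write(soup.prettify())
f.close()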




Deleting elements
from bs4 import BeautifulSoup

fp = open("song.xml")
soup = BeautifulSoup(fp, "html.parser")
print(soup)
soup.song.decompose()
print(soup)
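
For reference, extract() also removes a tag but returns it, so it can be re-attached later, whereas decompose() destroys it. A minimal sketch using the same song.xml:

removed = soup.song.extract()    # removed from the tree, but still a usable Tag
soup.songlist.append(removed)    # can be re-inserted somewhere else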

Korea Meteorological Administration (KMA) weather
from bs4 import BeautifulSoup
import urllib.request as REQ

nurl = "http://www.weather.go.kr/weather/forecast/mid-term-rss3.jsp?stnId=109"response = REQ.urlopen(nurl)

soup = BeautifulSoup(response, "html.parser")
locations = soup.findAll("location")

def printCity(location):
    print(location.city.string)
    print("-"*10)
    datas = location.findAll('data')
    for d in datas:
        print(d.tmef.string, d.wf.string, d.tmn.string, d.tmx.string, sep="\n")  # forecast time, weather, min temp, max temp

def printWeather():
    for location in locations:
        printCity(location)

def searchWeather():
    city = input("Enter a region to search: ")
    cities = soup.findAll('city')
    for c in cities:
        if (c.string == city):
            printCity(c.parent)
            break
def showTop5():
    tmx = soup.findAll("tmx")
    # convert to int so the sort is numeric; sorting the raw strings would put "9" above "30"
    sml = sorted(tmx, key=lambda n: int(n.string), reverse=True)[:5]
    for t in sml:
        print(t.parent.parent.city.string, t.parent.tmef.string, t.parent.wf.string, t.string)

while True:
    print("서울 경기 주간 예보")
    m = int(input("1. 서울 경기 지역 날씨 정보\n2. 지역 검색\n3. 최고기온 top5\n>"))
    {1:printWeather, 2:searchWeather, 3:showTop5}.get(m)()
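
Note that .get(m) returns None for any other input, so the call raises a TypeError. A minimal sketch of a guarded version of the loop (the quit option is an addition, not in the original notes):

while True:
    print("Seoul/Gyeonggi weekly forecast")
    m = int(input("1. Seoul/Gyeonggi weather\n2. Search by region\n3. Top 5 highest temperatures\n0. Quit\n>"))
    if m == 0:
        break
    func = {1: printWeather, 2: searchWeather, 3: showTop5}.get(m)
    if func:
        func()
    else:
        print("Invalid choice")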
