from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
# Path to the ChromeDriver executable. A raw string keeps the backslash
# literal instead of relying on '\c' being an unrecognised escape sequence,
# which newer Python versions warn about.
chromedriver_path = r'.\chromedriver.exe'
# Service object built from the driver path
service = Service(chromedriver_path)
# Chrome options (left at their defaults)
options = Options()
# Start the Chrome WebDriver session
browser = webdriver.Chrome(service=service, options=options)
# Load the first ranking page and parse its HTML
url = "https://youtube-rank.com/board/bbs/board.php?bo_table=youtube"
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')
channel_list = soup.select('tr')
# `channel` was read below without ever being assigned (NameError).
# Use the first row that contains a category cell; rows without one
# (presumably header rows) cannot be parsed by the selectors below.
channel = next((row for row in channel_list if row.select('p.category')), None)
if channel is None:
    raise RuntimeError('no channel rows found on the ranking page')
# Extract the category of that channel
category = channel.select('p.category')[0].text.strip()
print(category)
# Sample output: [음악/댄스/가수]
# Extract the channel name
title = channel.select('h1 > a')[0].text.strip()
# Sample value: BLACKPINK
# Extract subscriber, view and video counts (kept as raw text)
subscriber = channel.select('.subscriber_cnt')[0].text
view = channel.select('.view_cnt')[0].text
video = channel.select('.video_cnt')[0].text
print(subscriber)
print(view)
print(video)
# Sample output:
# 9420만
# 360억2196만
# 597개
# Print the details of every channel row on the current page
channel_list = soup.select('tbody > tr')
for channel in channel_list:
    # Name and category are whitespace-trimmed; the first match of each
    # selector is used.
    title, category = (
        channel.select(css)[0].text.strip()
        for css in ('h1 > a', 'p.category')
    )
    # The three counters are kept exactly as they appear in the page.
    subscriber, view, video = (
        channel.select(css)[0].text
        for css in ('.subscriber_cnt', '.view_cnt', '.video_cnt')
    )
    print(title, category, subscriber, view, video)
# Crawl pages 1-10 of the YouTube ranking board and collect one
# [title, category, subscriber, view, video] list per channel row.
results = []
try:
    for page in range(1, 11):
        url = f"https://youtube-rank.com/board/bbs/board.php?bo_table=youtube&page={page}"
        browser.get(url)
        time.sleep(2)  # fixed pause before reading the page source
        html = browser.page_source
        soup = BeautifulSoup(html, 'html.parser')
        channel_list = soup.select('form > table > tbody > tr')
        for channel in channel_list:
            title = channel.select('h1 > a')[0].text.strip()
            category = channel.select('p.category')[0].text.strip()
            subscriber = channel.select('.subscriber_cnt')[0].text
            view = channel.select('.view_cnt')[0].text
            video = channel.select('.video_cnt')[0].text
            data = [title, category, subscriber, view, video]
            results.append(data)
finally:
    # Nothing later in this script uses the browser. Quit it so the Chrome
    # and driver processes do not linger, even if a page fails to parse.
    browser.quit()
# Build a DataFrame from the crawled rows.
# The column names are passed to the constructor so that an empty `results`
# list still yields a frame with the expected columns; assigning
# `df.columns` afterwards raises a length-mismatch ValueError in that case.
df = pd.DataFrame(results, columns=['title', 'category', 'subscriber', 'view', 'video'])
df.head()  # preview; only displayed in an interactive session
# Excel export is disabled; uncomment to write the file that is read back
# for the visualization step.
# df.to_excel('./files/youtube_rank.xlsx', index = False)
# Visualizing the YouTube ranking data
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import platform
# Pick the matplotlib font family per operating system
# (presumably so that Korean category labels render correctly).
os_name = platform.system()
if os_name == 'Darwin':
    rc('font', family='AppleGothic')
elif os_name == 'Windows':
    # Resolve the family name from the font file itself
    path = 'c:/Windows/Fonts/malgun.ttf'
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Check your OS system')
def _korean_count_to_int(text):
    """Convert a count written with Korean unit suffixes to an integer.

    Accepts an optional 억 (10**8) part, followed by an optional 만 (10**4)
    part, followed by optional trailing digits. For example '9420만' gives
    94200000 and '1억2000만' gives 120000000. Plain digit strings are
    returned unchanged as integers.

    Raises:
        ValueError: if the text is empty or a numeric part is not an integer.
    """
    text = str(text).strip()
    if not text:
        raise ValueError('empty subscriber count')
    eok, has_eok, rest = text.partition('억')
    if not has_eok:
        eok, rest = '', text
    man, has_man, ones = rest.partition('만')
    if not has_man:
        man, ones = '', rest
    return int(eok or 0) * 10**8 + int(man or 0) * 10**4 + int(ones or 0)


# Load the saved ranking data
df = pd.read_excel('./files/youtube_rank.xlsx')
# Turn the subscriber text into integers. Replacing only '만' with '0000'
# fails for counts of 1억 or more, so each unit is parsed explicitly.
df['replaced_subscriber'] = df['subscriber'].map(_korean_count_to_int).astype('int64')
# Aggregate per category: total subscribers and number of channels
pivot_df = df.pivot_table(
    index='category',
    values='replaced_subscriber',
    aggfunc=['sum', 'count'],
)
# Replace the two aggregate column labels with flat names
pivot_df.columns = ['subscriber_sum', 'category_count']
# Turn the category index back into a column, then order the rows by
# total subscribers, largest first
pivot_df = pivot_df.reset_index().sort_values(by='subscriber_sum', ascending=False)
# Pie chart: share of total subscribers per category
category_labels = pivot_df['category']
subscriber_totals = pivot_df['subscriber_sum']
plt.figure(figsize=(30, 10))
plt.pie(subscriber_totals, labels=category_labels, autopct='%1.1f%%')
plt.show()
# Pie chart: share of channels per category
# Re-order the rows by channel count, largest first
pivot_df = pivot_df.sort_values(by='category_count', ascending=False)
pivot_df.head()
category_labels = pivot_df['category']
channel_counts = pivot_df['category_count']
plt.figure(figsize=(30, 10))
plt.pie(channel_counts, labels=category_labels, autopct='%1.1f%%')
plt.show()