[웹 크롤링] 유튜브 랭킹 데이터 수집과 시각화

데이터 분석/Python 2024. 7. 6. 21:56

from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup 
import time
import pandas as pd

# Path to the ChromeDriver executable, relative to the working directory.
# Written as a raw string: the plain '.\chromedriver.exe' form relies on the
# invalid escape sequence '\c', which newer Python versions warn about.
# The resulting string value is unchanged.
chromedriver_path = r'.\chromedriver.exe'

# Service object pointing at the driver executable.
service = Service(chromedriver_path)

# Chrome options; nothing is customised here.
options = Options()

# Create the Chrome WebDriver used for every page load in this script.
browser = webdriver.Chrome(service=service, options=options)

# Open the ranking board and parse the HTML the browser currently holds.
url = "https://youtube-rank.com/board/bbs/board.php?bo_table=youtube"
browser.get(url)

html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')

# Every <tr> element found on the page.
channel_list = soup.select('tr')

# Pick one ranking row to experiment with. Without this assignment the
# lookups below fail with NameError because `channel` was never bound.
# The selector is the same one the multi-page crawl further down uses.
channel = soup.select('form > table > tbody > tr')[0]

# Extract the category text.
category = channel.select('p.category')[0].text.strip()
print(category)
# Example output: [음악/댄스/가수]

# Extract the channel name.
title = channel.select('h1 > a')[0].text.strip()
# Example value: BLACKPINK

# Subscriber, view and video counts are kept as the raw cell text.
subscriber = channel.select('.subscriber_cnt')[0].text
view = channel.select('.view_cnt')[0].text
video = channel.select('.video_cnt')[0].text

print(subscriber)
print(view)
print(video)
# Example output:
# 9420만
# 360억2196만
# 597개

# Print one line per table-body row of the page parsed into `soup`.
channel_list = soup.select('tbody > tr')
for channel in channel_list:
    # Channel name and category are stripped of surrounding whitespace.
    name_tag = channel.select('h1 > a')[0]
    category_tag = channel.select('p.category')[0]
    title, category = name_tag.text.strip(), category_tag.text.strip()
    # The three count cells are used exactly as they appear in the markup.
    subscriber, view, video = (
        channel.select(css)[0].text
        for css in ('.subscriber_cnt', '.view_cnt', '.video_cnt')
    )
    print(title, category, subscriber, view, video)
    
# Crawl pages 1 through 10 of the ranking board and collect one
# [title, category, subscriber, view, video] list per channel row.
results = []
try:
    for page in range(1, 11):
        url = f"https://youtube-rank.com/board/bbs/board.php?bo_table=youtube&page={page}"
        browser.get(url)
        # Fixed pause before reading the page source (presumably to let the
        # page finish rendering).
        time.sleep(2)
        html = browser.page_source
        soup = BeautifulSoup(html, 'html.parser')
        channel_list = soup.select('form > table > tbody > tr')
        for channel in channel_list:
            title = channel.select('h1 > a')[0].text.strip()
            category = channel.select('p.category')[0].text.strip()
            subscriber = channel.select('.subscriber_cnt')[0].text
            view = channel.select('.view_cnt')[0].text
            video = channel.select('.video_cnt')[0].text
            data = [title, category, subscriber, view, video]
            results.append(data)
finally:
    # Nothing after this point uses the browser, so end the session here,
    # even when a page fails to parse, instead of leaving Chrome and the
    # driver process running.
    browser.quit()
        
# Turn the crawled rows into a DataFrame (and optionally save it to Excel).
# The column names are passed to the constructor so that an empty `results`
# list still yields a correctly labelled, empty frame; assigning
# `df.columns` afterwards raises a length-mismatch ValueError in that case.
df = pd.DataFrame(
    results,
    columns=['title', 'category', 'subscriber', 'view', 'video'],
)
df.head()
# The export is disabled here. The visualization part of this file reads the
# workbook back from this same path.
# df.to_excel('./files/youtube_rank.xlsx', index = False)

유튜브 랭킹 데이터 시각화하기

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import platform
# Select the matplotlib font family according to the operating system
# (presumably so the Korean category labels render in the charts - confirm).
os_name = platform.system()
if os_name == 'Darwin':
    rc('font', family='AppleGothic')
elif os_name == 'Windows':
    # Resolve the family name from the font file itself.
    path = 'c:/Windows/Fonts/malgun.ttf'
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    # No font is configured for any other system; only a hint is printed.
    print('Check your OS system')
    
# Load the saved ranking workbook.
df = pd.read_excel('./files/youtube_rank.xlsx')

# Turn subscriber strings such as '9420만' into integers by replacing the
# '만' suffix with four zeros.
# NOTE(review): only the '만' suffix is handled; any value that still contains
# a non-digit character after the replacement makes the integer cast fail.
# Confirm the column never contains other unit characters.
df['replaced_subscriber'] = df['subscriber'].str.replace('만', '0000')
# Use an explicit 64-bit type: plain 'int' follows the platform default,
# which is 32-bit on Windows with NumPy < 2.
df['replaced_subscriber'] = df['replaced_subscriber'].astype('int64')

# Aggregate per category: total subscribers and number of rows.
agg_columns = ['subscriber_sum', 'category_count']
pivot_df = pd.pivot_table(
    df,
    index='category',
    values='replaced_subscriber',
    aggfunc=['sum', 'count'],
)

# Replace the column labels produced by the two aggregations with flat names.
pivot_df.columns = agg_columns

# Move 'category' from the index back into a regular column, then order the
# rows from the largest subscriber total to the smallest.
pivot_df = pivot_df.reset_index().sort_values(by='subscriber_sum', ascending=False)

# Pie chart: share of total subscribers per category.
plt.figure(figsize=(30, 10))
subscriber_totals = pivot_df['subscriber_sum']
category_labels = pivot_df['category']
plt.pie(subscriber_totals, labels=category_labels, autopct='%1.1f%%')
plt.show()

# Pie chart: share of channels per category.
# Re-sort so the categories with the most channels come first.
pivot_df = pivot_df.sort_values(by='category_count', ascending=False)
pivot_df.head()
plt.figure(figsize=(30, 10))
channel_counts = pivot_df['category_count']
category_labels = pivot_df['category']
plt.pie(channel_counts, labels=category_labels, autopct='%1.1f%%')
plt.show()

'데이터 분석 > Python' 카테고리의 다른 글

[웹 크롤링] 멜론 노래 순위 정보 크롤링 (0)	2024.07.06
[pandas] 기초 (0)	2024.07.06

ABOUT ME

letsfuture letsfuture

유튜브 랭킹 데이터 시각화하기

'데이터 분석 > Python' 카테고리의 다른 글

티스토리툴바

ABOUT ME

유튜브 랭킹 데이터 시각화하기

'데이터 분석 > Python' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바