This commit is contained in:
48
imdb_parsen/test.py
Normal file
48
imdb_parsen/test.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# Downloading imdb top 250 movie's data
|
||||
url = 'http://www.imdb.com/chart/top'
|
||||
response = requests.get(url)
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
movies = soup.select('td.titleColumn')
|
||||
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
|
||||
ratings = [b.attrs.get('data-value')
|
||||
for b in soup.select('td.posterColumn span[name=ir]')]
|
||||
|
||||
|
||||
# create a empty list for storing
|
||||
# movie information
|
||||
list = []
|
||||
|
||||
# Iterating over movies to extract
|
||||
# each movie's details
|
||||
for index in range(0, len(movies)):
|
||||
|
||||
# Separating movie into: 'place',
|
||||
# 'title', 'year'
|
||||
movie_string = movies[index].get_text()
|
||||
movie = (' '.join(movie_string.split()).replace('.', ''))
|
||||
movie_title = movie[len(str(index))+1:-7]
|
||||
# year = re.search('\((.*?)\)', movie_string).group(1)
|
||||
place = movie[:len(str(index))-(len(movie))]
|
||||
data = {"place": place,
|
||||
"movie_title": movie_title,
|
||||
"rating": ratings[index],
|
||||
# "year": year,
|
||||
"star_cast": crew[index],
|
||||
}
|
||||
list.append(data)
|
||||
|
||||
# printing movie details with its rating.
|
||||
for movie in list:
|
||||
print(movie['place'], '-', movie['movie_title'],
|
||||
'Starring:', movie['star_cast'], movie['rating'])
|
||||
|
||||
|
||||
## .......##
|
||||
df = pd.DataFrame(list)
|
||||
df.to_csv('imdb_top_250_movies.csv', index=False)
|
||||
Reference in New Issue
Block a user