18082025

2025-08-18 16:08:22 +02:00
parent a692ac8b05
commit 6678aed520
11 changed files with 179 additions and 5 deletions
--- a/imdb_parsen/test.py
+++ b/imdb_parsen/test.py
@@ -0,0 +1,48 @@
+from bs4 import BeautifulSoup
+import requests
+import re
+import pandas as pd
+
+
+# Download the IMDb Top 250 chart page and collect, per movie, the title
+# cell, the star cast (the link's "title" attribute) and the rating value.
+url = 'http://www.imdb.com/chart/top'
+# The timeout keeps the script from hanging forever on a stalled connection;
+# raise_for_status() fails loudly instead of parsing an HTTP error page.
+response = requests.get(url, timeout=30)
+response.raise_for_status()
+soup = BeautifulSoup(response.text, "html.parser")
+movies = soup.select('td.titleColumn')
+crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
+ratings = [b.attrs.get('data-value')
+           for b in soup.select('td.posterColumn span[name=ir]')]
+
+# The three lists are parallel (entry i of each describes the same movie), so
+# refuse to pair them up if the selectors returned different item counts.
+if not len(movies) == len(crew) == len(ratings):
+    raise ValueError('IMDb chart columns have different lengths')
+
+# One dict per movie; named so that the built-in `list` is not shadowed.
+movie_rows = []
+for cell, star_cast, rating in zip(movies, crew, ratings):
+    # Collapse whitespace; expected shape: "<place>. <title> (<year>)".
+    movie = ' '.join(cell.get_text().split())
+    # Split on the first ". " so multi-digit places ("10", "100") stay whole
+    # and dots inside the title are preserved.
+    place, _, rest = movie.partition('. ')
+    # Drop the trailing " (<year>)" suffix, leaving only the title.
+    movie_title = rest.rsplit(' (', 1)[0]
+    movie_rows.append({"place": place,
+                       "movie_title": movie_title,
+                       "rating": rating,
+                       "star_cast": star_cast,
+                       })
+
+# Print each movie's details with its rating.
+for movie in movie_rows:
+    print(movie['place'], '-', movie['movie_title'],
+          'Starring:', movie['star_cast'], movie['rating'])
+
+# Save the collected rows to a CSV file, omitting the DataFrame index.
+df = pd.DataFrame(movie_rows)
+df.to_csv('imdb_top_250_movies.csv', index=False)