notebooks/data-augmentation/movie-descriptions/movie_descriptions.ipynb
!pip install pandas
!pip install scrapy
!pip install wikipedia
import requests
from scrapy import Selector
from urllib.parse import urljoin
import re
import pandas as pd
from random import choice
import wikipedia
def number_filtering(texto):
    """Extract and concatenate every digit run found in ``texto``.

    Used to pull the page number out of a pagination href such as
    ``/films/ajax/popular/page/2/``.

    Args:
        texto: The string to scan, or ``None`` (the "next" link selector
            returns ``None`` on the last results page).

    Returns:
        All digit groups joined together, e.g. ``"2"``; an empty string
        when ``texto`` is ``None`` or contains no digits.
    """
    # Guard against None so the caller can pass .get() output directly.
    if texto is None:
        return ""
    return "".join(re.findall(r"\d+", texto))
# Instruction templates: one is chosen at random per film and the film
# title is appended to form the INSTRUCTION field of each dataset row.
questions = [
    "Write a description of the film",
    "Write a description about the film",
    "I would like you to summarise the film",
    "Can you write a summary of the film",
    "Summarise the film",  # fixed grammar: was "Summarises the film"
    # ...
]
# Letterboxd's AJAX endpoint serves the "popular films" grid, 72 films per page.
home_url = "https://letterboxd.com"
# A timeout stops the scrape from hanging forever on a stalled connection.
response = requests.get(
    "https://letterboxd.com/films/ajax/popular/?esiAllowFilters=true",
    timeout=30,
)
# Accumulator for the instruction-tuning rows (one row per film).
df = pd.DataFrame(columns=["INSTRUCTION", "RESPONSE", "SOURCE"])
wikipedia.set_lang("en")
# Loop flag: cleared when the row limit is hit or the last page is reached.
process = True
# Main scrape loop: walk Letterboxd's popular-films pages, look each film
# up on Wikipedia, and collect (instruction, description) pairs into df.
while process:
    selector1 = Selector(text=response.text)
    # Each popular-films page lists 72 posters; grab their detail-page hrefs.
    films_urls = selector1.css(
        ".listitem.poster-container div::attr(data-target-link)"
    ).getall()
    for url in films_urls:
        # The full scrape can take hours or days, so cap the number of rows.
        if df.shape[0] == 5:
            process = False
            break
        response = requests.get(urljoin(home_url, url), timeout=30)
        selector = Selector(text=response.text)
        film_title = selector.css("h1.headline-1.js-widont.prettify ::text").get()
        # Reset before the lookup so a failed lookup can never silently
        # reuse the previous film's description (was a latent bug).
        film_description = None
        try:
            # Pass sentences=<int> to wikipedia.summary for a shorter text.
            film_description = wikipedia.summary(
                f"{film_title} film", auto_suggest=True
            )
        except wikipedia.DisambiguationError as e:
            print(e)
            # Fall back to a plain search and take the first result whose
            # title contains the film name.
            results = wikipedia.search(film_title, 10, False)
            # BUGFIX: was `results[0]`, which iterated the *characters* of
            # the first result string instead of the result titles.
            for res in results:
                if film_title in res:
                    film_description = wikipedia.summary(res, auto_suggest=False)
                    break  # stop at the first match; no need for more API calls
        except wikipedia.PageError:
            film_description = None
        if film_description:
            # DataFrame.append was removed in pandas 2.0; concatenate a
            # one-row frame instead.
            row = {
                "INSTRUCTION": f"{choice(questions)} {film_title}",
                "RESPONSE": film_description,
                "SOURCE": "Wikipedia & Letterbox",
            }
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
            print(df)
    # The "next" link is absent on the last page; .get() then returns None,
    # so guard before extracting the page number.
    next_href = selector1.css(".paginate-nextprev a.next::attr(href)").get()
    next_p_number = number_filtering(next_href) if next_href else ""
    if next_p_number:
        next_page = (
            f"https://letterboxd.com/films/ajax/popular/page/"
            f"{next_p_number}?esiAllowFilters=true"
        )
        response = requests.get(urljoin(home_url, next_page), timeout=30)
    else:
        process = False
        break
# Display the collected dataset (notebook cell output).
df
# Optional exports — uncomment to persist the dataset:
# df.to_json("films.jsonl", orient='records', lines=True)
# df.to_parquet("films.parquet", row_group_size=100, engine="pyarrow")