
Movie Descriptions

notebooks/data-augmentation/movie-descriptions/movie_descriptions.ipynb


python
!pip install pandas
!pip install scrapy
!pip install wikipedia
python
import requests
from scrapy import Selector
from urllib.parse import urljoin
import re
import pandas as pd
from random import choice
import wikipedia
python
def number_filtering(texto: str):
    # Keep only the digits from a string, e.g. the page number in a pagination href
    numeros = re.findall(r"\d+", texto)
    resultado = "".join(numeros)
    return resultado
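A quick sanity check of what number_filtering returns for the kind of pagination href it is used on later (the path below is only an illustrative example):
python
# Illustrative example: extract the page number from a "next page" href.
print(number_filtering("/films/ajax/popular/page/2/"))  # prints "2"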
python
questions = [
    "Write a description of the film",
    "Write a description about the film",
    "I would like you to summarise the film",
    "Can you write a summary of the film",
    "Summarises the film"
    # ...
]
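Each template is later paired with a scraped film title to form the INSTRUCTION field; a minimal sketch of that step, using a placeholder title:
python
# Illustrative example of how an instruction string is assembled in the scraping loop below.
example_title = "Parasite"  # placeholder title, not scraped data
print(f"{choice(questions)} {example_title}")  # e.g. "Write a description of the film Parasite"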
python
home_url = "https://letterboxd.com"
response = requests.get("https://letterboxd.com/films/ajax/popular/?esiAllowFilters=true")
df = pd.DataFrame(columns=["INSTRUCTION", "RESPONSE", "SOURCE"], index=None)
wikipedia.set_lang("en")

process = True
while process:
    selector1 = Selector(text=response.text)
    # Get film urls (72 per page)
    films_urls = selector1.css(".listitem.poster-container div::attr(data-target-link)").getall()
    for url in films_urls:
        # The process can take hours or days, so I recommend setting a limit
        # on the number of rows to collect. Example:
        if df.shape[0] == 5:
            process = False
            break

        response = requests.get(urljoin(home_url, url))
        selector = Selector(text=response.text)
        film_title = selector.css("h1.headline-1.js-widont.prettify ::text").get()
        film_description = None
        try:
            # You can pass the 'sentences: int' keyword to get a shorter film description
            film_description = wikipedia.summary(f"{film_title} film", auto_suggest=True)
        except wikipedia.DisambiguationError as e:
            print(e)
            # Fall back to a plain search and take the first result containing the title
            results = wikipedia.search(film_title, 10, False)
            for res in results:
                if film_title in res:
                    film_description = wikipedia.summary(res, auto_suggest=False)
                    break
        except wikipedia.PageError:
            film_description = None

        if film_description:
            # DataFrame.append was removed in pandas 2.x, so build a one-row frame and concat
            new_row = {
                "INSTRUCTION": f"{choice(questions)} {film_title}",
                "RESPONSE": film_description,
                "SOURCE": "Wikipedia & Letterboxd",
            }
            df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
        print(df)

    next_link = selector1.css(".paginate-nextprev a.next::attr(href)").get()
    next_p_number = number_filtering(next_link) if next_link else ""
    if next_p_number:
        next_page = f"https://letterboxd.com/films/ajax/popular/page/{next_p_number}?esiAllowFilters=true"
        response = requests.get(urljoin(home_url, next_page))
    else:
        process = False
        break
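The summary text returned by wikipedia.summary can be long. As the comment inside the loop notes, the sentences keyword limits how much of it is fetched; a minimal sketch, where the page title is only an illustrative example:
python
# Hypothetical example: fetch only the first three sentences of a film summary.
# "Parasite (2019 film)" is an illustrative page title, not output from the scraper.
print(wikipedia.summary("Parasite (2019 film)", sentences=3, auto_suggest=False))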
python
df
python
# df.to_json("films.jsonl", orient='records', lines=True)
# df.to_parquet("films.parquet", row_group_size=100, engine="pyarrow")
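If either export line is uncommented, the same files can be reloaded later; a minimal sketch assuming those file names:
python
# Sketch: reload the exported dataset (assumes the export lines above were run first).
# df_jsonl = pd.read_json("films.jsonl", orient="records", lines=True)
# df_parquet = pd.read_parquet("films.parquet", engine="pyarrow")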