notebooks/data-augmentation/movie-descriptions/movie_descriptions.ipynb
!pip install pandas
!pip install scrapy
!pip install wikipedia
import requests
from scrapy import Selector
from urllib.parse import urljoin
import re
import pandas as pd
from random import choice
import wikipedia
def number_filtering(texto):
    """Extract and concatenate every digit run found in ``texto``.

    Used to pull the page number out of a pagination href such as
    ``/films/ajax/popular/page/2/``.

    Args:
        texto: The string to scan, or ``None`` (the "next" link selector
            returns ``None`` on the last results page).

    Returns:
        All digit groups joined together, e.g. ``"2"``; an empty string
        when ``texto`` is ``None`` or contains no digits.
    """
    # Guard against None so the caller can pass .get() output directly.
    if texto is None:
        return ""
    return "".join(re.findall(r"\d+", texto))
# Instruction templates: one is chosen at random per film and the film
# title is appended to form the INSTRUCTION field of each dataset row.
questions = [
    "Write a description of the film",
    "Write a description about the film",
    "I would like you to summarise the film",
    "Can you write a summary of the film",
    "Summarise the film",  # fixed grammar: was "Summarises the film"
    # ...
]
# Letterboxd's AJAX endpoint serves the "popular films" grid, 72 films per page.
home_url = "https://letterboxd.com"
# A timeout stops the scrape from hanging forever on a stalled connection.
response = requests.get(
    "https://letterboxd.com/films/ajax/popular/?esiAllowFilters=true",
    timeout=30,
)
# Accumulator for the instruction-tuning rows (one row per film).
df = pd.DataFrame(columns=["INSTRUCTION", "RESPONSE", "SOURCE"])
wikipedia.set_lang("en")
# Loop flag: cleared when the row limit is hit or the last page is reached.
process = True
# Main scrape loop: walk Letterboxd's popular-films pages, look each film
# up on Wikipedia, and collect (instruction, description) pairs into df.
while process:
    selector1 = Selector(text=response.text)
    # Each popular-films page lists 72 posters; grab their detail-page hrefs.
    films_urls = selector1.css(
        ".listitem.poster-container div::attr(data-target-link)"
    ).getall()
    for url in films_urls:
        # The full scrape can take hours or days, so cap the number of rows.
        if df.shape[0] == 5:
            process = False
            break
        response = requests.get(urljoin(home_url, url), timeout=30)
        selector = Selector(text=response.text)
        film_title = selector.css("h1.headline-1.js-widont.prettify ::text").get()
        # Reset before the lookup so a failed lookup can never silently
        # reuse the previous film's description (was a latent bug).
        film_description = None
        try:
            # Pass sentences=<int> to wikipedia.summary for a shorter text.
            film_description = wikipedia.summary(
                f"{film_title} film", auto_suggest=True
            )
        except wikipedia.DisambiguationError as e:
            print(e)
            # Fall back to a plain search and take the first result whose
            # title contains the film name.
            results = wikipedia.search(film_title, 10, False)
            # BUGFIX: was `results[0]`, which iterated the *characters* of
            # the first result string instead of the result titles.
            for res in results:
                if film_title in res:
                    film_description = wikipedia.summary(res, auto_suggest=False)
                    break  # stop at the first match; no need for more API calls
        except wikipedia.PageError:
            film_description = None
        if film_description:
            # DataFrame.append was removed in pandas 2.0; concatenate a
            # one-row frame instead.
            row = {
                "INSTRUCTION": f"{choice(questions)} {film_title}",
                "RESPONSE": film_description,
                "SOURCE": "Wikipedia & Letterbox",
            }
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
            print(df)
    # The "next" link is absent on the last page; .get() then returns None,
    # so guard before extracting the page number.
    next_href = selector1.css(".paginate-nextprev a.next::attr(href)").get()
    next_p_number = number_filtering(next_href) if next_href else ""
    if next_p_number:
        next_page = (
            f"https://letterboxd.com/films/ajax/popular/page/"
            f"{next_p_number}?esiAllowFilters=true"
        )
        response = requests.get(urljoin(home_url, next_page), timeout=30)
    else:
        process = False
        break
# Display the collected dataset (notebook cell output).
df
# Optional exports — uncomment to persist the dataset:
# df.to_json("films.jsonl", orient='records', lines=True)
# df.to_parquet("films.parquet", row_group_size=100, engine="pyarrow")