tutorials/flyte/notebook.ipynb
import daft
# S3 is accessed in anonymous mode (no credentials are supplied) against
# the us-west-2 region.
IO_CONFIG = daft.io.IOConfig(
    s3=daft.io.S3Config(region_name="us-west-2", anonymous=True),
)
# Location of the LAION sample parquet file used throughout this tutorial.
PARQUET_PATH = "s3://daft-public-data/tutorials/laion-parquet/train-00000-of-00001-6f24a7497df494ae.parquet"
# Open the parquet file with the anonymous S3 configuration, then narrow the
# dataframe to the three columns the rest of the tutorial works with.
parquet_df = daft.read_parquet(PARQUET_PATH, io_config=IO_CONFIG)
parquet_df = parquet_df.select(
    *(parquet_df[column_name] for column_name in ("URL", "TEXT", "AESTHETIC_SCORE"))
)

# Run the query, then preview the first five rows.
parquet_df.collect()
parquet_df.show(5)
# Keep only the rows whose TEXT column contains "darkness", then preview
# the first five of them.
caption_mentions_darkness = parquet_df["TEXT"].contains("darkness")
filtered_df = parquet_df.where(caption_mentions_darkness)
filtered_df.show(5)
# Build the "image" column in two steps: download the bytes behind each URL,
# then decode those bytes as an image. Both steps are given on_error="null"
# so that a failing row is expected to yield a null rather than abort the query.
downloaded_bytes = filtered_df["URL"].download(on_error="null")
decoded_image = downloaded_bytes.decode_image(on_error="null")
filtered_df = filtered_df.with_column("image", decoded_image)
filtered_df.show(5)
# Add a "resized_image" column holding each decoded image resized to 32x32,
# then preview the first five rows.
thumbnail = filtered_df["image"].resize(32, 32)
filtered_df = filtered_df.with_column("resized_image", thumbnail)
filtered_df.show(5)
# Persist only the first five rows to a local parquet output, then collect
# the dataframe returned by the write.
first_five_rows = filtered_df.limit(5)
written_df = first_five_rows.write_parquet("resized_images.parquet")
written_df.collect()