Back to Everyone Can Use English

get md files in md_path

1000-hours/public/jupyter-notebooks/check-media.ipynb

0.7.9955 B
Original Source
python
import os
# Folders, relative to this notebook, that hold the markdown pages and the
# audio clips they are expected to reference.
md_path = "../../sounds-of-american-english/"
mp3_path = "../audios/"

# Names of the entries directly inside md_path that end with ".md".
md_files = list(filter(lambda name: name.endswith('.md'), os.listdir(md_path)))

# Names of the entries directly inside mp3_path that end with ".mp3".
mp3_files = list(filter(lambda name: name.endswith('.mp3'), os.listdir(mp3_path)))

# Read every markdown file and collect each sub-string found between
# "audios/" and ".mp3", i.e. the names of the audio clips the pages reference.
import re

# The dot before "mp3" is escaped so that only a literal ".mp3" extension ends
# a match. Unescaped, it matches any character, so a reference such as
# "audios/us-mp3.mp3" would be captured as "us" instead of "us-mp3".
audio_ref = re.compile(r'audios/(.*?)\.mp3')

# A set drops duplicate references as they are found.
audios_in_md = set()
for md_file in md_files:
    # NOTE(review): assumes the markdown sources are UTF-8 encoded; decoding
    # explicitly avoids depending on the platform's default encoding.
    with open(os.path.join(md_path, md_file), 'r', encoding='utf-8') as f:
        for line in f:
            # Lines without "audios/" simply yield no matches.
            audios_in_md.update(audio_ref.findall(line))

# Hand the rest of the script a plain list, duplicates already removed.
audios_in_md = list(audios_in_md)
# print(len(audios_in_md))

# Report every audio referenced in the markdown files that has no matching
# mp3 file: print the part of its name before the first "-", then a comma.
available_mp3s = set(mp3_files)  # built once so each membership test is O(1)

# Sorted so the report comes out in the same order on every run;
# audios_in_md was de-duplicated through a set, whose order is arbitrary.
for audio in sorted(audios_in_md):
    if f'{audio}.mp3' not in available_mp3s:
        print(f'{audio.split("-")[0].strip()},')

# for mp3 in mp3_files:
#     if not mp3.replace('.mp3', '') in audios_in_md:
#         print(mp3)