# Source notebook: 1000-hours/public/jupyter-notebooks/spelling-rules.ipynb
import sys
import requests
import json
import vlc
import re
import random
from IPython.display import Audio
def load_json_database(url, timeout=30):
    """Download a line-delimited JSON file and return its parsed records.

    Each non-empty line of the response body is parsed as one JSON document.
    Lines that fail to parse are reported with ``print`` and skipped.  Any
    request failure (connection error, timeout, bad HTTP status) is reported
    with ``print`` and whatever was parsed so far (possibly nothing) is
    returned.

    Args:
        url: Address of the line-delimited JSON resource.
        timeout: Seconds to wait for the server to connect / send data before
            giving up.  Without a timeout ``requests`` waits forever on a
            stalled connection.

    Returns:
        A list with one parsed object per valid line.
    """
    records = []
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
        for line in response.iter_lines(decode_unicode=True):
            if not line:
                # iter_lines can yield empty keep-alive / blank lines.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON: {e}")
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching data from URL: {e}")
    return records
def search_in_json_database(database, search_word, region):
    """Look up the pronunciation of ``search_word`` for a given region.

    Scans ``database`` (an iterable of dict records) for records whose
    ``'word'`` equals ``search_word`` and walks their
    ``'pos_items'`` -> ``'pronunciations'`` lists in order.  The first
    pronunciation whose ``'region'`` equals ``region`` wins.

    Returns:
        A dict with the keys ``'pronunciation'`` and ``'audio'`` (each may be
        ``None`` when the record lacks that field), or the string
        ``'not exist'`` when no matching word/region pair is found.
    """
    # Lazily flatten record -> pos_item -> pronunciation, keeping only the
    # pronunciations of the requested word and region.
    candidates = (
        pron
        for entry in database
        if entry.get('word') == search_word
        for pos in entry.get('pos_items', [])
        for pron in pos.get('pronunciations', [])
        if pron.get('region') == region
    )
    hit = next(candidates, None)
    if hit is None:
        # No record matched the word (or none had the requested region).
        return 'not exist'
    return {
        'pronunciation': hit.get('pronunciation'),
        'audio': hit.get('audio'),
    }
# Dictionary source; load_json_database() parses it one JSON document per line.
url = "https://raw.githubusercontent.com/zelic91/camdict/main/cam_dict.refined.json"
# List of parsed records (empty if the download failed); searched below with
# search_in_json_database().
json_database = load_json_database(url)
# Text to convert.  The loop below splits it on newlines, turns backtick-quoted
# phonetics into <span> markup and treats lines containing '*' as example lists.
text ="""
之前讲过,非重音音节里的元音可能会被弱化为 schwa `ə`…… 在自然语流中,连 schwa `ə` 都可能会被进一步弱化,变成非常轻的 `ɤ`。比如,常用词 reasonable,`/ˈriːzənəbəl/`,实际听到的常常是` /ˈriːzɤnəbəl/`……
"""
def _wrap_example(word, phonetics):
    """Return the nested-bullet HTML snippet (word, phonetics, audio hooks) for one example."""
    return f'\t- {word} <span class="pho alt">{phonetics}</span> <span class="speak-word-inline" data-audio-us-male="/audios/us/{word}-us-male.mp3" data-audio-us-female="/audios/us/{word}-us-female.mp3"></span>\n'


# Convert `text` line by line and print the result:
#   * backtick-quoted phonetics become <span class="pho"> elements;
#   * on lines containing '*', the comma-separated items after the first '-'
#     are treated as example words, looked up in `json_database` (US region)
#     and rewritten as nested bullets with phonetics and audio hooks.
# Afterwards `words` holds every example word and `phonetics_not_exist` the
# ones with no US entry, both as comma-terminated strings ("a,b,").
collected_words = []
missing_words = []
lines = text.split("\n")
for line in lines:
    has_examples = '*' in line
    if has_examples:
        # Quoted text becomes markdown bold.  Done before the backtick pass so
        # the quotes of the inserted class="pho" attribute are not rewritten.
        line = line.replace('"', "**")
    line = line.replace(" `", " <span class=\"pho\">")
    line = line.replace("`", "</span>")
    if has_examples:
        parts = line.split("-")
        # A '*' line without any '-' has no example list; avoid IndexError.
        raw_examples = parts[1].split(",") if len(parts) > 1 else []
        # Drop blank items (e.g. from a trailing comma): replacing '' would
        # splice the markup between every character of the line.
        examples = [x.strip() for x in raw_examples if x.strip()]
        line = line.replace(" - ", "\n")
        wrapped = {}
        for e in examples:
            collected_words.append(e)
            entry_us = search_in_json_database(json_database, e, 'us')
            if entry_us == 'not exist':
                # The literal 'not exist' is shown in place of the phonetics.
                phonetics = entry_us
                missing_words.append(e)
            else:
                phonetics = entry_us['pronunciation']
            wrapped[e] = _wrap_example(e, phonetics)
        if wrapped:
            # Substitute every example in ONE pass, longest first, so that
            # already-inserted markup is never rescanned (an example such as
            # "us" or "male" would otherwise match inside earlier snippets)
            # and "reason" cannot clobber "reasonable".
            pattern = re.compile(
                "|".join(re.escape(e) for e in sorted(wrapped, key=len, reverse=True))
            )
            line = pattern.sub(lambda m: wrapped[m.group(0)], line)
            line = line.replace(',', '').strip()
    print(line)

# Keep the comma-terminated format; the audio step later splits `words` on ','.
words = "".join(f"{w}," for w in collected_words)
phonetics_not_exist = "".join(f"{w}," for w in missing_words)
print(f'phonetics_not_exist: {phonetics_not_exist}')
print('\n'+words)
import asyncio
import edge_tts
import pygame
text = words.rstrip(",")
Wordlist = text.split(",")
# Wordlist = ['reasonable']
print(Wordlist)
for w in Wordlist:
# for VOICE in ['en-US-GuyNeural', 'en-US-JennyNeural', 'en-GB-RyanNeural', 'en-GB-SoniaNeural']:
for VOICE in ['en-US-GuyNeural', 'en-US-MichelleNeural']:
w = w.strip()
# OUTPUT_FILE = f"{w}-{VOICE.replace('EricNeural', 'Guy-Male').replace('JennyNeural', 'Jenny-Female').replace('RyanNeural', 'Ryan-Male').replace('SoniaNeural', 'Sonia-Female').lower()}.mp3"
OUTPUT_FILE = f"{w}-{VOICE.replace('GuyNeural', 'Male').replace('MichelleNeural', 'Female').replace('en-', '').lower()}.mp3"
communicate = edge_tts.Communicate(w, VOICE)
await communicate.save(OUTPUT_FILE)
print(w)
print("Files created!")