1000-hours/public/jupyter-notebooks/chapter4.ipynb
import sys
import requests
import json
import vlc
import re
import random
from IPython.display import Audio
def load_json_database(url, timeout=30):
    """Download a line-delimited JSON document and return its records.

    Every non-empty line of the response body is parsed as one JSON value.
    A line that fails to parse is reported on stdout and skipped. A failed
    request (bad status, connection problem, timeout) is reported on stdout
    and the records collected so far are returned.

    Args:
        url: Address of the line-delimited JSON resource.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests.get` can block indefinitely on a stalled
            connection.

    Returns:
        A list with one parsed value per valid line; empty if nothing could
        be fetched or parsed.
    """
    records = []
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        for line in response.iter_lines(decode_unicode=True):
            if not line:
                # Blank lines carry no record
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from URL: {e}")
    return records
def search_in_json_database(database, search_word, region):
    """Look up the pronunciation of a word for one region.

    Records are scanned in order. For every record whose 'word' field equals
    `search_word`, its 'pos_items' and their 'pronunciations' are searched
    for the first entry whose 'region' equals `region`.

    Args:
        database: Iterable of dict records.
        search_word: Exact value to match against each record's 'word'.
        region: Region code to match against a pronunciation's 'region'
            (the notebook passes 'us').

    Returns:
        A dict with keys 'pronunciation' and 'audio' for the first match,
        or the string 'not exist' when no matching pronunciation is found.
    """
    for record in database:
        # Only records for the requested word are of interest
        if record.get('word') != search_word:
            continue
        # `or []` also covers keys that are present but hold JSON null,
        # which would otherwise raise TypeError when iterated.
        for pos_item in record.get('pos_items') or []:
            for pronunciation in pos_item.get('pronunciations') or []:
                if pronunciation.get('region') == region:
                    # First pronunciation for the requested region wins
                    return {
                        'pronunciation': pronunciation.get('pronunciation'),
                        'audio': pronunciation.get('audio'),
                    }
    # No record supplied a pronunciation for this word and region
    return 'not exist'
# Address of the line-delimited JSON dictionary dump (presumably Cambridge
# Dictionary data, judging by the "camdict" repository name - confirm upstream).
url = "https://raw.githubusercontent.com/zelic91/camdict/main/cam_dict.refined.json"
# Fetch and parse the whole database once; later cells look words up in it.
json_database = load_json_database(url)
import re

# Markdown paragraph to convert; backtick pairs delimit phonetic notation.
text ="""
之前讲过,非重音音节里的元音可能会被弱化为 schwa `ə`…… 在自然语流中,连 schwa `ə` 都可能会被进一步弱化,变成非常轻的 `ɤ`。比如,常用词 reasonable,`/ˈriːzənəbəl/`,实际听到的常常是` /ˈriːzɤnəbəl/`……
"""

# Example words seen so far, and the subset the dictionary has no entry for.
collected_words = []
missing_phonetics = []

lines = text.split("\n")
for line in lines:
    # Wrap each backtick pair in a phonetics span. Pairing the backticks
    # (instead of treating only " `" as an opener) keeps the markup balanced
    # when the opening backtick follows punctuation rather than a space.
    line = re.sub(r'`([^`]+)`', r'<span class="pho">\1</span>', line)
    if '*' in line:
        line = line.replace('"', "**")
        parts = line.split("-")
        if len(parts) > 1:
            # Drop empty items (e.g. from a trailing comma): replacing ''
            # would splice the wrapper between every character of the line.
            examples = [x.strip() for x in parts[1].split(",") if x.strip()]
        else:
            # Emphasised line without an example list: nothing to wrap
            examples = []
        line = line.replace(" - ", "\n")
        for e in examples:
            collected_words.append(e)
            entry_us = search_in_json_database(json_database, e, 'us')
            if entry_us == 'not exist':
                # Keep the sentinel visible in the output and remember the word
                phonetics = entry_us
                missing_phonetics.append(e)
            else:
                phonetics = entry_us['pronunciation']
            # One indented list item per example: word, phonetics, audio widget
            wrapped_e = f'\t- {e} <span class="pho alt">{phonetics}</span> <span class="speak-word-inline" data-audio-us-male="/audios/us/{e}-us-male.mp3" data-audio-us-female="/audios/us/{e}-us-female.mp3"></span>\n'
            line = line.replace(e, wrapped_e).replace(',', '').strip()
    print(line)

# Both summaries keep the original "item," format (trailing comma included);
# the audio-generation cell strips that trailing comma from `words`.
words = "".join(f"{item}," for item in collected_words)
phonetics_not_exist = "".join(f"{item}," for item in missing_phonetics)
print(f'phonetics_not_exist: {phonetics_not_exist}')
print('\n'+words)
import asyncio
import edge_tts
import pygame
text = words.rstrip(",")
Wordlist = text.split(",")
# Wordlist = ['reasonable']
print(Wordlist)
for w in Wordlist:
# for VOICE in ['en-US-GuyNeural', 'en-US-JennyNeural', 'en-GB-RyanNeural', 'en-GB-SoniaNeural']:
for VOICE in ['en-US-GuyNeural', 'en-US-MichelleNeural']:
w = w.strip()
# OUTPUT_FILE = f"{w}-{VOICE.replace('EricNeural', 'Guy-Male').replace('JennyNeural', 'Jenny-Female').replace('RyanNeural', 'Ryan-Male').replace('SoniaNeural', 'Sonia-Female').lower()}.mp3"
OUTPUT_FILE = f"{w}-{VOICE.replace('GuyNeural', 'Male').replace('MichelleNeural', 'Female').replace('en-', '').lower()}.mp3"
communicate = edge_tts.Communicate(w, VOICE)
await communicate.save(OUTPUT_FILE)
print(w)
print("Files created!")
text ="""
举个例子,*ichthyosaur*,这个一看就知道并非常用的词汇,其实很简单,先从表音构成去看,`/ˈɪk.θi.ə.sɔːr/` —— 剑桥词典把它划分成了 4 个音节…… 但感觉上,第二第三个音节可以合并,`/ˈɪk.θiə.sɔːr/`,*ich* ⭤ `/ˈɪk/`, *thyo* ⭤ `/θiə/`, *saur* ⭤ `/sɔːr/`…… 而从表意的角度去看呢?前半部 *ichthyo-* 的意思是 “与鱼有关的”…… 后半部 *-saur* 是什么意思呢?各种恐龙的 “龙” 都是 -saur 结尾,于是,这个词的意思是 “鱼龙”…… 换言之,这个单词的两个部分,都是拉丁词根词缀,也都是 “既表音又表意” 的,事实上很简单 —— 虽然拼写乍看起来很复杂。
"""
sound_files = "ichthyosaur".split(",")
# regex, replace `...` with <span class="pho">...</span>
import re
text = text.replace("/", "")
text = re.sub(r'`([^`]+)`', r'<span class="pho alt">\1</span>', text)
print(text)
# get sound files
for s in sound_files:
w = s.strip()
text = text.replace(f"*{w}*", w)
# get the audio file
import asyncio
import edge_tts
import pygame
for VOICE in ['en-US-GuyNeural', 'en-US-MichelleNeural']:
# OUTPUT_FILE = f"{w}-{VOICE.replace('EricNeural', 'Guy-Male').replace('JennyNeural', 'Jenny-Female').replace('RyanNeural', 'Ryan-Male').replace('SoniaNeural', 'Sonia-Female').lower()}.mp3"
OUTPUT_FILE = f"{w}-{VOICE.replace('GuyNeural', 'Male').replace('MichelleNeural', 'Female').replace('en-', '').lower()}.mp3"
communicate = edge_tts.Communicate(w, VOICE)
await communicate.save(OUTPUT_FILE)
print(f"Audio files for {w} created!")
entry_us = search_in_json_database(json_database, w, 'us')
if entry_us == 'not exist':
phonetics = entry_us
else:
phonetics = entry_us['pronunciation']
wrapped_w = f'*{w}* <span class="pho alt">{phonetics}</span> <span class="speak-word-inline" data-audio-us-male="/audios/us/{w}-us-male.mp3" data-audio-us-female="/audios/us/{w}-us-female.mp3"></span>\n'
text = text.replace(w, wrapped_w)
# get phonetics for the word
print(text)
# send text to clipboard
import pyperclip
pyperclip.copy(text.replace("\n", ""))
print("Text copied to clipboard!")
!pip install pyperclip
list = """
1. **airplane**
2. **airport**
3. **backyard**
4. **bedroom**
5. **birthday**
6. **blackboard**
7. **bookstore**
8. **brainstorm**
9. **breakfast**
10. **classroom**
11. **cupcake**
12. **daydream**
13. **dishwasher**
14. **doorbell**
15. **downtown**
16. **earthquake**
17. **everyday**
18. **eyewitness**
19. **firefighter**
20. **football**
21. **greenhouse**
22. **handwriting**
23. **headache**
24. **highway**
25. **homework**
26. **iceberg**
27. **jellyfish**
28. **laptop**
29. **lighthouse**
30. **mailbox**
31. **moonlight**
32. **notebook**
33. **nobody**
34. **pancake**
35. **postcard**
36. **rainbow**
37. **sailboat**
38. **sandbox**
39. **seashore**
40. **skateboard**
41. **snowflake**
42. **spaceship**
43. **sunflower**
44. **sunshine**
45. **superhero**
46. **tablecloth**
47. **toothbrush**
48. **toothpaste**
49. **typewriter**
50. **underwater**
51. **upstairs**
52. **volleyball**
53. **waterfall**
54. **watermelon**
55. **weekend**
56. **wheelchair**
57. **windmill**
58. **workshop**
"""
lines = list.split("\n")
for l in lines:
if l.strip() == "":
continue
# extract str between ** and **
word = re.search(r'\*\*(.*)\*\*', l).group(1)
import asyncio
import edge_tts
import pygame
for VOICE in ['en-US-GuyNeural', 'en-US-MichelleNeural']:
# OUTPUT_FILE = f"{w}-{VOICE.replace('EricNeural', 'Guy-Male').replace('JennyNeural', 'Jenny-Female').replace('RyanNeural', 'Ryan-Male').replace('SoniaNeural', 'Sonia-Female').lower()}.mp3"
OUTPUT_FILE = f"{word}-{VOICE.replace('GuyNeural', 'Male').replace('MichelleNeural', 'Female').replace('en-', '').lower()}.mp3"
communicate = edge_tts.Communicate(word, VOICE)
await communicate.save(OUTPUT_FILE)
# print(f"Audio files for {word} created!")
entry_us = search_in_json_database(json_database, word, 'us')
if entry_us == 'not exist':
phonetics = entry_us
else:
phonetics = entry_us['pronunciation']
wrapped_p = f' <span class="pho alt">{phonetics}</span> <span class="speak-word-inline" data-audio-us-male="/audios/us/{word}-us-male.mp3" data-audio-us-female="/audios/us/{word}-us-female.mp3"></span>'
l += wrapped_p
print(l)