new-edition-drafts/jupyter-notebooks/eng-to-ipa.ipynb
%pip install eng-to-ipa pypandoc nltk
import eng_to_ipa as ipa
# Quick smoke test of eng_to_ipa: transcribe a single common word.
text = """
but
"""
ipa.convert(text)
# NOTE(review): presumably tests a proper noun / possibly out-of-dictionary
# word — confirm what marker eng_to_ipa emits for words missing from CMUdict.
ipa.convert("ned")
import eng_to_ipa as ipa
def phonemes_complete(text):
    """Return True if `text` contains every one of the 39 eng_to_ipa phonemes.

    The text is transcribed to IPA with eng_to_ipa, then each phoneme symbol
    is checked for membership in the transcription. A summary is printed:
    either a success message or the list of missing phonemes.

    Parameters:
        text: English text to check.

    Returns:
        bool: True when all 39 phonemes occur in the transcription.
    """
    phonemes = ipa.convert(text)
    # CMU Pronouncing Dictionary uses 39 phonemes,
    # http://www.speech.cs.cmu.edu/cgi-bin/cmudict
    #
    # /ʌ/ is transformed into /ə/ in this module, and /ə:/ to /ər/, /ɔ:/ to /ɔr/
    # ... so there are still 39 phonemes in this module.
    ipa_vowels = "ɛ,æ,ə,ɑr,ər,ɪ,i,ɔ,ɔr,ʊ,u,aɪ,eɪ,aʊ,oʊ,ɔɪ"
    ipa_consonants = "b,ʧ,d,ð,f,g,h,ʤ,k,l,m,n,ŋ,p,s,ʃ,t,θ,v,w,j,z,ʒ"
    ipa_all = ipa_vowels.split(",") + ipa_consonants.split(",")  # all 39 phonemes as a list
    # Collect every phoneme symbol absent from the transcription.
    missing_phonemes = [p for p in ipa_all if p not in phonemes]
    complete = not missing_phonemes
    if complete:
        # Bug fix: the inventory holds 39 phonemes (16 vowels + 23 consonants),
        # not 38 as the old message claimed.
        print("The text includes all 39 phonemes...")
    else:
        # Typo fix: "Phonems" -> "Phonemes".
        print(f"Phonemes that are not included in this text:\n{missing_phonemes}...")
    return complete
# This sentence is a candidate phonemic pangram — intended to contain every
# English phoneme at least once; verify with phonemes_complete below.
text = """
`Are those shy Eurasian footwear, cowboy chaps, or jolly earthmoving headgear?`
"""
print(ipa.convert(text))
phonemes_complete(text)
# check paragraphs
# %pip install eng_to_ipa pypandoc
import eng_to_ipa as ipa
import pypandoc
def phonemes_complete(text):
    """Return True if the IPA transcription of `text` contains all 39 phonemes.

    Silent variant intended for bulk scanning of many paragraphs: unlike the
    earlier interactive version, nothing is printed.

    Parameters:
        text: English text to check.

    Returns:
        bool: True when every eng_to_ipa phoneme occurs in the transcription.
    """
    phonemes = ipa.convert(text)
    # CMU Pronouncing Dictionary uses 39 phonemes,
    # http://www.speech.cs.cmu.edu/cgi-bin/cmudict
    #
    # /ʌ/ is transformed into /ə/ in this module, and /ə:/ to /ər/, /ɔ:/ to /ɔr/
    # ... so there are still 39 phonemes in this module.
    ipa_vowels = "ɛ,æ,ə,ɑr,ər,ɪ,i,ɔ,ɔr,ʊ,u,aɪ,eɪ,aʊ,oʊ,ɔɪ"
    ipa_consonants = "b,ʧ,d,ð,f,g,h,ʤ,k,l,m,n,ŋ,p,s,ʃ,t,θ,v,w,j,z,ʒ"
    ipa_all = ipa_vowels.split(",") + ipa_consonants.split(",")  # all 39 phonemes as a list
    # Cleanup: the old loop built an unused `missing_phonemes` list and kept
    # commented-out print statements; all() short-circuits on the first miss.
    return all(p in phonemes for p in ipa_all)
# Convert the ePub file to plain text using pypandoc.
epub_path = "/Users/joker/Desktop/2023-best-3-books/Peak/Peak.epub"
output_txt = "peak.txt"
output = pypandoc.convert_file(epub_path, 'plain', outputfile=output_txt)

# Read the converted text back in.
with open(output_txt, 'r', encoding='utf-8') as file:
    text = file.read()

# Paragraphs are assumed to be separated by a blank line (two newlines).
paragraphs = text.split('\n\n')

# Count and display every paragraph that covers the full phoneme inventory,
# skipping short fragments such as chapter titles.
i = 0
for p in paragraphs:
    if len(p) >= 20 and phonemes_complete(p):
        i += 1
        print(f"\n### ({i}) ###\n{p}\n{'-'*80}")
print(f'{i} paragraphs are phoneme complete.')
# check sentences
# %pip install eng_to_ipa pypandoc nltk
import eng_to_ipa as ipa
# Bug fix: nltk is used below (nltk.download here, nltk.tokenize.sent_tokenize
# later) but was never imported in this cell -> NameError at runtime.
import nltk
import pypandoc

# The Punkt models are required by nltk.tokenize.sent_tokenize.
nltk.download('punkt')
def phonemes_complete(text):
    """Return True if the IPA transcription of `text` contains all 39 phonemes.

    Silent variant intended for bulk scanning of many sentences: unlike the
    earlier interactive version, nothing is printed.

    Parameters:
        text: English text to check.

    Returns:
        bool: True when every eng_to_ipa phoneme occurs in the transcription.
    """
    phonemes = ipa.convert(text)
    # CMU Pronouncing Dictionary uses 39 phonemes,
    # http://www.speech.cs.cmu.edu/cgi-bin/cmudict
    #
    # /ʌ/ is transformed into /ə/ in this module, and /ə:/ to /ər/, /ɔ:/ to /ɔr/
    # ... so there are still 39 phonemes in this module.
    ipa_vowels = "ɛ,æ,ə,ɑr,ər,ɪ,i,ɔ,ɔr,ʊ,u,aɪ,eɪ,aʊ,oʊ,ɔɪ"
    ipa_consonants = "b,ʧ,d,ð,f,g,h,ʤ,k,l,m,n,ŋ,p,s,ʃ,t,θ,v,w,j,z,ʒ"
    ipa_all = ipa_vowels.split(",") + ipa_consonants.split(",")  # all 39 phonemes as a list
    # Cleanup: the old loop built an unused `missing_phonemes` list and kept
    # commented-out print statements; all() short-circuits on the first miss.
    return all(p in phonemes for p in ipa_all)
# Convert the ePub file to plain text using pypandoc.
epub_path = "/Users/joker/Desktop/2023-best-3-books/Peak/Peak.epub"
output_txt = "peak.txt"
output = pypandoc.convert_file(epub_path, 'plain', outputfile=output_txt)

# Read the converted text file.
with open(output_txt, 'r', encoding='utf-8') as file:
    text = file.read()

# Split the text into sentences with NLTK's Punkt sentence tokenizer.
sentences = nltk.tokenize.sent_tokenize(text)

# Count and display every sentence that covers the full phoneme inventory.
i = 0
for s in sentences:
    if len(s) < 20:  # skip headings and short fragments
        continue
    if phonemes_complete(s):
        i += 1
        print(
            f'''
### ({i}) ###
{s}
{'-'*80}
'''
        )
# Bug fix: this loop counts sentences, not paragraphs (copy-paste message).
print(f'{i} sentences are phoneme complete.')
# Reference inventory: the full British-English phoneme set (44 symbols),
# grouped as monophthongs, diphthongs, and consonants.
phonemes = [
    # Monophthongs
    'i:', 'ɪ', 'e', 'æ', 'ɑ:', 'ɒ', 'ɔ:', 'ʌ', 'ʊ', 'u:', 'ə', 'ɜːr',
    # Diphthongs
    'eɪ', 'aɪ', 'ɔɪ', 'aʊ', 'əʊ', 'ɪə', 'eə', 'ʊə',
    # Consonants
    'p', 'b', 't', 'd', 'k', 'g', 'f', 'v', 'θ', 'ð', 's', 'z',
    'ʃ', 'ʒ', 'h', 'm', 'n', 'ŋ', 'l', 'r', 'w', 'j', 'tʃ', 'dʒ',
]
print(len(phonemes))

# eng_to_ipa's reduced inventory: 16 vowels + 23 consonants = 39 symbols.
ipa_vowels = "ɛ,æ,ə,ɑr,ər,ɪ,i,ɔ,ɔr,ʊ,u,aɪ,eɪ,aʊ,oʊ,ɔɪ"
ipa_consonants = "b,ʧ,d,ð,f,g,h,ʤ,k,l,m,n,ŋ,p,s,ʃ,t,θ,v,w,j,z,ʒ"
ipa_all = [*ipa_vowels.split(","), *ipa_consonants.split(",")]
print(len(ipa_all))
# Transcribe a full paragraph (an excerpt about practice-effectiveness
# research) to IPA in a single call; the result is the cell's displayed value.
ipa.convert('''
In particular, the researchers counted the number of mistakes a student
made in practicing a piece the first time and the second time and used
the improvement from the first time to the second as a measurement of
how effectively the student was practicing. They found a wide variation
in the amount of improvement. Of all the students they studied, a female
cornet player in her first year of learning the instrument made the most
mistakes: 11 per minute, on average, on the first times playing pieces
during practice sessions. On the second time through, she was still
making the same mistakes 70 percent of the time—noticing and correcting
only 3 out of every 10 mistakes. By contrast, the best first-year
player, a boy who was learning the saxophone, made only 1.4 mistakes per
minute on his first times through. And on the second times through, he
was making the same mistakes only 20 percent of the time—correcting 8
out of every 10 mistakes. The difference in the percentage of
corrections is particularly striking because the saxophone player was
already making many fewer mistakes, so he had much less room for
improvement.''')