Back to Everyone Can Use English

Eng To Ipa

1000-hours/public/jupyter-notebooks/eng-to-ipa.ipynb

0.7.96.4 KB
Original Source
python
%pip install eng-to-ipa pypandoc nltk
python
# Transcribe a single word to IPA with the eng_to_ipa module (cell output
# shows the returned transcription string).
import eng_to_ipa as ipa
text = """
but
"""
ipa.convert(text)
python
ipa.convert("ned")
python
import eng_to_ipa as ipa

def phonemes_complete(text):
    """Check whether *text* covers this module's full 39-phoneme inventory.

    Converts *text* to IPA with eng_to_ipa, then tests each phoneme for
    presence in the transcription.  Prints a human-readable summary and
    returns True when no phoneme is missing, False otherwise.

    NOTE(review): presence is a plain substring test, so a one-character
    phoneme (e.g. "t") also "matches" inside a digraph (e.g. "tʃ") — the
    check may be optimistic; confirm if exactness matters.
    """
    phonemes = ipa.convert(text)

    # CMU Pronouncing Dictionary uses 39 phonemes,
    # http://www.speech.cs.cmu.edu/cgi-bin/cmudict
    #
    # /ʌ/ is transformed into /ə/ in this module, and /ə:/ to /ər/, /ɔ:/ to /ɔr/
    # ... so there are still 39 phonemes in this module.
    ipa_vowels = "ɛ,æ,ə,ɑr,ər,ɪ,i,ɔ,ɔr,ʊ,u,aɪ,eɪ,aʊ,oʊ,ɔɪ"
    ipa_consonants = "b,tʃ,d,ð,f,g,h,dʒ,k,l,m,n,ŋ,p,s,ʃ,t,θ,v,w,j,z,ʒ"
    ipa_all = ipa_vowels.split(",") + ipa_consonants.split(",")  # all 39 phonemes as a list

    # Collect every phoneme that never appears in the transcription.
    missing_phonemes = [p for p in ipa_all if p not in phonemes]
    complete = not missing_phonemes
    if complete:
        # Bug fix: the inventory above holds 39 phonemes, not 38.
        print("The text includes all 39 phonemes...")
    else:
        # Bug fix: "Phonems" -> "Phonemes" in the user-facing message.
        print(f"Phonemes that are not included in this text:\n{missing_phonemes}...")

    return complete


# A (claimed) phonetic pangram; check whether it really covers the inventory.
text = """
`Are those shy Eurasian footwear, cowboy chaps, or jolly earthmoving headgear?`
"""
print(ipa.convert(text))  # show the full IPA transcription

phonemes_complete(text)  # report any missing phonemes
python
# check paragraphs
# %pip install eng_to_ipa pypandoc

import eng_to_ipa as ipa
import pypandoc

def phonemes_complete(text):
    """Return True when *text*'s IPA transcription contains all 39 phonemes.

    Silent variant for bulk scanning: unlike the interactive version it
    prints nothing and only returns the boolean result.

    NOTE(review): presence is a plain substring test, so a one-character
    phoneme (e.g. "t") also "matches" inside a digraph (e.g. "tʃ") — the
    check may be optimistic; confirm if exactness matters.
    """
    phonemes = ipa.convert(text)

    # CMU Pronouncing Dictionary uses 39 phonemes,
    # http://www.speech.cs.cmu.edu/cgi-bin/cmudict
    #
    # /ʌ/ is transformed into /ə/ in this module, and /ə:/ to /ər/, /ɔ:/ to /ɔr/
    # ... so there are still 39 phonemes in this module.
    ipa_vowels = "ɛ,æ,ə,ɑr,ər,ɪ,i,ɔ,ɔr,ʊ,u,aɪ,eɪ,aʊ,oʊ,ɔɪ"
    ipa_consonants = "b,tʃ,d,ð,f,g,h,dʒ,k,l,m,n,ŋ,p,s,ʃ,t,θ,v,w,j,z,ʒ"
    ipa_all = ipa_vowels.split(",") + ipa_consonants.split(",")  # convert them into a list

    # Dead commented-out print block removed; the unused missing-phonemes
    # accumulator went with it, leaving a single short-circuiting check.
    return all(p in phonemes for p in ipa_all)

# Convert the ePub file to plain text using pypandoc.
# NOTE(review): hard-coded local path — adjust before running elsewhere.
epub_path = "/Users/joker/Desktop/2023-best-3-books/Peak/Peak.epub"
output_txt = "peak.txt"

output = pypandoc.convert_file(epub_path, 'plain', outputfile=output_txt)

# Read the converted text file.
with open(output_txt, 'r', encoding='utf-8') as file:
    text = file.read()

# Split the text into paragraphs.
paragraphs = text.split('\n\n')  # Assuming paragraphs are separated by two newlines.

i = 0  # running count of phoneme-complete paragraphs

for p in paragraphs:
    if len(p) < 20: # skip titles...
        continue
    if phonemes_complete(p):
        i += 1
        # Print each hit with its ordinal and a divider line.
        print(
f'''
### ({i}) ###
{p}
{'-'*80}'''
        )
        

print(f'{i} paragraphs are phoneme complete.')
python
# check sentences
# %pip install eng_to_ipa pypandoc nltk

import eng_to_ipa as ipa
import nltk  # bug fix: nltk was used below (download, sent_tokenize) but never imported
import pypandoc

nltk.download('punkt')  # tokenizer models required by nltk.tokenize.sent_tokenize

def phonemes_complete(text):
    """Return True when *text*'s IPA transcription contains all 39 phonemes.

    Silent variant for bulk scanning: unlike the interactive version it
    prints nothing and only returns the boolean result.

    NOTE(review): presence is a plain substring test, so a one-character
    phoneme (e.g. "t") also "matches" inside a digraph (e.g. "tʃ") — the
    check may be optimistic; confirm if exactness matters.
    """
    phonemes = ipa.convert(text)

    # CMU Pronouncing Dictionary uses 39 phonemes,
    # http://www.speech.cs.cmu.edu/cgi-bin/cmudict
    #
    # /ʌ/ is transformed into /ə/ in this module, and /ə:/ to /ər/, /ɔ:/ to /ɔr/
    # ... so there are still 39 phonemes in this module.
    ipa_vowels = "ɛ,æ,ə,ɑr,ər,ɪ,i,ɔ,ɔr,ʊ,u,aɪ,eɪ,aʊ,oʊ,ɔɪ"
    ipa_consonants = "b,tʃ,d,ð,f,g,h,dʒ,k,l,m,n,ŋ,p,s,ʃ,t,θ,v,w,j,z,ʒ"
    ipa_all = ipa_vowels.split(",") + ipa_consonants.split(",")  # convert them into a list

    # Dead commented-out print block removed; the unused missing-phonemes
    # accumulator went with it, leaving a single short-circuiting check.
    return all(p in phonemes for p in ipa_all)

# Convert the ePub file to plain text using pypandoc.
# NOTE(review): hard-coded local path — adjust before running elsewhere.
epub_path = "/Users/joker/Desktop/2023-best-3-books/Peak/Peak.epub"
output_txt = "peak.txt"

output = pypandoc.convert_file(epub_path, 'plain', outputfile=output_txt)

# Read the converted text file.
with open(output_txt, 'r', encoding='utf-8') as file:
    text = file.read()

# Split the text into sentences.
sentences = nltk.tokenize.sent_tokenize(text)

i = 0  # running count of phoneme-complete sentences

for s in sentences:
    if len(s) < 20:  # skip very short fragments (headings, stray lines)
        continue
    if phonemes_complete(s):
        i += 1
        # Print each hit with its ordinal and a divider line.
        print(
f'''
### ({i}) ###
{s}
{'-'*80}
'''
        )

# Bug fix: this loop counts sentences, not paragraphs.
print(f'{i} sentences are phoneme complete.')
python
# Reference inventory of 44 English phonemes, built group by group so each
# category can be read (and counted) on its own.
monophthongs = ['i:', 'ɪ', 'e', 'æ', 'ɑ:', 'ɒ', 'ɔ:', 'ʌ', 'ʊ', 'u:', 'ə', 'ɜːr']
diphthongs = ['eɪ', 'aɪ', 'ɔɪ', 'aʊ', 'əʊ', 'ɪə', 'eə', 'ʊə']
consonants = ['p', 'b', 't', 'd', 'k', 'g', 'f', 'v', 'θ', 'ð', 's', 'z',
              'ʃ', 'ʒ', 'h', 'm', 'n', 'ŋ', 'l', 'r', 'w', 'j', 'tʃ', 'dʒ']

phonemes = monophthongs + diphthongs + consonants
print(len(phonemes))
python
# The 39-phoneme inventory used above, rebuilt by joining both CSV strings
# first and splitting once.
ipa_vowels = "ɛ,æ,ə,ɑr,ər,ɪ,i,ɔ,ɔr,ʊ,u,aɪ,eɪ,aʊ,oʊ,ɔɪ"
ipa_consonants = "b,tʃ,d,ð,f,g,h,dʒ,k,l,m,n,ŋ,p,s,ʃ,t,θ,v,w,j,z,ʒ"
ipa_all = f"{ipa_vowels},{ipa_consonants}".split(",")  # vowels first, then consonants
print(len(ipa_all))

python
# Transcribe a full paragraph (from the "Peak" ePub converted above) to IPA;
# the transcription is shown as the cell output.
ipa.convert('''
In particular, the researchers counted the number of mistakes a student
made in practicing a piece the first time and the second time and used
the improvement from the first time to the second as a measurement of
how effectively the student was practicing. They found a wide variation
in the amount of improvement. Of all the students they studied, a female
cornet player in her first year of learning the instrument made the most
mistakes: 11 per minute, on average, on the first times playing pieces
during practice sessions. On the second time through, she was still
making the same mistakes 70 percent of the time—noticing and correcting
only 3 out of every 10 mistakes. By contrast, the best first-year
player, a boy who was learning the saxophone, made only 1.4 mistakes per
minute on his first times through. And on the second times through, he
was making the same mistakes only 20 percent of the time—correcting 8
out of every 10 mistakes. The difference in the percentage of
corrections is particularly striking because the saxophone player was
already making many fewer mistakes, so he had much less room for
improvement.''')