"""
Checks TextGrid for integrity.
Uses TextGrid parser from NLTK:
http://nltk.googlecode.com/svn/trunk/nltk_contrib/nltk_contrib/textgrid.py
This script is really only suitable for the project I was working on.
However, the code can be used as an example.
More on http://www.languagebits.com/?p=749
"""
# Module metadata: author, contact address and license.
__author__ = 'Romeo Mlinar'
__mail__ = 'mlinar [a] languagebits.com'
__license__ = 'GNU General Public License version 3'
import os
import sys
from textgrid import TextGrid
# Directory that holds the .TextGrid files; read by get_grid() and run().
# Hard-coded Windows path -- adjust it to the local corpus location.
PATH = r'D:\corpus\analyse'
def get_grid(f, path=None):
    """Read one TextGrid file and return its content as a single string.

    Arguments:
    f    -- name of the file, relative to the directory
    path -- directory that contains the file; when omitted, the
            module-level PATH is used

    Returns the complete text of the file.
    """
    if path is None:
        path = PATH
    # The context manager closes the handle even if reading fails; the
    # file is read in one go instead of line by line and re-joined.
    with open(os.path.join(path, f), 'r') as handle:
        return handle.read()
def get_paths(path):
    """List the TextGrid files found directly in *path*, sorted by name.

    Only entries whose extension is exactly '.TextGrid' are kept. The
    result holds bare file names, not full paths.
    """
    return sorted(
        name for name in os.listdir(path)
        if os.path.splitext(name)[1] == '.TextGrid'
    )
def get_nonempty(tier):
    """Return the transcript entries of *tier* whose text is not empty.

    Entries are taken from tier.simple_transcript; the text is the third
    field (index 2) of each entry. The original order is kept.
    """
    return [entry for entry in tier.simple_transcript if entry[2] != '']
def get_checklist():
    """Load the word -> diphthong lookup table from 'diph.txt'.

    The file is read from the current working directory. Every non-blank
    line holds two tab-separated fields: the diphthong first, the word
    second. Blank lines are ignored.

    Returns a dict that maps each word to its diphthong.
    Raises IndexError if a non-blank line contains no tab.
    """
    check = {}
    # The context manager makes sure the file is closed after reading.
    with open('diph.txt', 'r') as source:
        for line in source:
            line = line.strip()
            if not line:
                # A trailing or stray empty line is not an entry.
                continue
            fields = line.split('\t')
            check[fields[1]] = fields[0]
    return check
def get_text(tier):
    """Collect the text field (index 2) of every entry in *tier*.

    Returns a new list with the texts in tier order.
    """
    return [entry[2] for entry in tier]
def all_there(tier, t, mapping=None):
    """Verify that every label in *tier* is one of the allowed strings.

    Arguments:
    tier    -- sequence of entries whose text is stored at index 2
    t       -- tier name; 'diph' checks the labels against the values of
               the mapping, 'word' checks them against its keys
    mapping -- word -> diphthong dict; when omitted, the module-level
               checklist is used

    Returns None when all labels are allowed. On the first label that is
    not allowed, or when *t* is neither 'diph' nor 'word', a message is
    printed and the program exits with status 1.
    """
    if mapping is None:
        mapping = checklist
    if t == 'diph':
        allowed = set(mapping.values())
    elif t == 'word':
        allowed = set(mapping)
    else:
        print('Wrong tier name.')
        sys.exit(1)
    # Positions are reported 1-based, as a user would count them.
    for position, item in enumerate(tier, 1):
        if item[2] not in allowed:
            print('String "%s" is not allowed but found in tier "%s", '
                  'at position %s.' % (item[2], t, position))
            # Non-zero status so that callers and shells see the failure.
            sys.exit(1)
def all_matches(diph, word, mapping=None):
    """Check that each diphthong label matches the word at the same position.

    Arguments:
    diph    -- sequence of diphthong entries (text at index 2)
    word    -- sequence of word entries (text at index 2)
    mapping -- word -> diphthong dict; when omitted, the module-level
               checklist is used

    The two sequences are compared pairwise for as many positions as both
    provide, rather than for a fixed 32 positions. Returns None when every
    pair matches. On the first mismatch a message is printed and the
    program exits with status 1.
    Raises KeyError if a word is not present in the mapping.
    """
    if mapping is None:
        mapping = checklist
    # Positions are reported 1-based.
    for position, (diph_item, word_item) in enumerate(zip(diph, word), 1):
        expected = mapping[word_item[2]]
        if expected != diph_item[2]:
            print('Mismatch: "%s" not allowed in "%s", at position %s.'
                  % (diph_item[2], word_item[2], position))
            print('It should say "%s".' % expected)
            # Non-zero status so that callers and shells see the failure.
            sys.exit(1)
def unique(tier, expected=16):
    """Check that *tier* holds exactly *expected* distinct labels.

    Arguments:
    tier     -- sequence of entries whose text is stored at index 2
    expected -- required number of distinct texts (16 by default)

    Returns None when the count is right. Otherwise the distinct labels
    and a message are printed and the program exits with status 1.
    """
    labels = tuple(set(item[2] for item in tier))
    if len(labels) != expected:
        print(labels)
        print("The diphthongs are not paired.")
        # Non-zero status so that callers and shells see the failure.
        sys.exit(1)
def checkwords(tier, mapping=None):
    """Check that *tier* contains every expected word exactly once.

    Arguments:
    tier    -- sequence of entries whose text is stored at index 2
    mapping -- dict whose keys are the expected words; when omitted, the
               module-level checklist is used

    Returns None on success. If an expected word is missing, a message is
    printed and the program exits with status 1.
    Raises AssertionError if texts remain after every expected word has
    been accounted for (duplicates or unknown words).
    """
    if mapping is None:
        mapping = checklist
    # Work on a fresh list of texts so the tier itself is left untouched.
    remaining = [item[2] for item in tier]
    for word in mapping:
        try:
            remaining.remove(word)
        except ValueError:
            print('Word "%s" is missing from this tier!' % word)
            # Non-zero status so that callers and shells see the failure.
            sys.exit(1)
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if remaining:
        raise AssertionError(
            'Unexpected extra words in tier: %r' % (remaining,))
def check_integrity(f):
    """Run every consistency check on one TextGrid file.

    Arguments:
    f -- name of the TextGrid file, as accepted by get_grid()

    The checks, in order: the first three tiers are named 'diph', 'point'
    and 'word'; the 'diph' and 'word' tiers each hold 32 non-empty items;
    all labels are allowed; the diphthongs are paired; every word is
    present; words and diphthongs match position by position.

    Raises AssertionError when a tier name or an item count is wrong.
    The remaining checks are delegated to helper functions, which print a
    message and terminate the program when they fail.
    """
    gridobj = TextGrid(get_grid(f))
    tiers = gridobj.tiers

    # Check if tiers are OK. Explicit raises are used instead of `assert`
    # so the validation is not stripped by `python -O` and carries a message.
    print("\tChecking proper tier names...")
    for index, name in enumerate(('diph', 'point', 'word')):
        found = tiers[index].nameid
        if found != name:
            raise AssertionError(
                'Tier %d is named "%s", expected "%s".' % (index, found, name))

    # Check number of words and diphthongs
    print("\tChecking if the tiers contain 32 items...")
    diph = get_nonempty(tiers[0])
    word = get_nonempty(tiers[2])
    for name, items in (('word', word), ('diph', diph)):
        if len(items) != 32:
            raise AssertionError(
                'Tier "%s" has %d non-empty items, expected 32.'
                % (name, len(items)))

    print("\tChecking if all tiers have valid text...")
    all_there(diph, 'diph')
    all_there(word, 'word')

    print("\tChecking if the diphthongs have pairs...")
    unique(diph)

    print("\tChecking if all words are present...")
    checkwords(word)

    print("\tChecking if the words and diphthongs match...")
    all_matches(diph, word)
# Word -> diphthong lookup table used by the checks (all_there,
# all_matches, checkwords). It is built once, when the module is executed.
checklist = get_checklist()
def run():
    """Run the integrity checks on every TextGrid file found in PATH.

    Progress is reported on standard output; each file is announced
    before it is checked and confirmed with 'OK' afterwards.
    """
    print('STARTING TEXTGRID CHECKS')
    print('The path is %s.' % PATH)
    # Collect the files first so their number can be reported.
    grids = get_paths(PATH)
    print("The number of text grids is %s." % len(grids))
    print("Starting the loop...")
    for name in grids:
        # Joined with a space to keep the exact spacing of the output.
        print(' '.join(('Checking file ', name)))
        check_integrity(name)
        # No space follows the tab; 'OK' and the name are space-separated.
        print('\t' + ' '.join(('OK', name)))
# Entry point: the checks start as soon as this module is executed.
# NOTE(review): there is no `if __name__ == '__main__'` guard, so importing
# the module also runs them -- confirm that this is intended.
run()