120,000 changes: 120,000 additions & 0 deletions CommaiMini-^$/Long/Verify_Produce_longer.tsv

Large diffs are not rendered by default.

108,000 changes: 108,000 additions & 0 deletions CommaiMini-^$/Long/Verify_Produce_train.tsv

Large diffs are not rendered by default.

120,000 changes: 120,000 additions & 0 deletions CommaiMini-^$/Long/Verify_Produce_unseen.tsv

Large diffs are not rendered by default.

120,000 changes: 120,000 additions & 0 deletions CommaiMini-^$/Long/Verify_Produce_unseen_longer.tsv

Large diffs are not rendered by default.

12,000 changes: 12,000 additions & 0 deletions CommaiMini-^$/Long/Verify_Produce_validation.tsv

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions CommaiMini-^$/Short/Test_data.tsv
@@ -0,0 +1,25 @@
not o and not 7 produce <eos> erm erm O i 2 4 5 5 6
e and ; verify U ] f ; / e : <eos> erm erm erm yes 1 2 3 10 11
not ! and $ and L verify Q c K [ p ! <eos> erm erm erm erm no 2 4 5 6 12 13
z or . or x produce <eos> erm erm erm x 1 3 4 5 6
N and j verify n ) ` ( j N Y <eos> erm erm erm yes 1 2 3 10 11
y or r verify O <eos> erm erm erm no 1 2 3 4 5
- and J and Q verify Q F 0 - J j n Q <eos> erm erm erm erm yes 1 3 4 5 13 14
T and > and a and not @ produce <eos> erm erm erm erm T > a s 1 3 5 7 8 8 8 8 9
not . and b produce <eos> erm erm h b 2 3 4 4 5
h and A produce <eos> erm erm h A 1 2 3 3 4
not 8 and r produce <eos> erm erm n r 2 3 4 4 5
E and not v and k and p produce <eos> erm erm erm erm E z k p 1 4 6 7 8 8 8 8 9
U and not r verify I r <eos> erm erm erm no 1 3 4 6 7
T and G and w produce <eos> erm erm erm T G w 1 3 4 5 5 5 6
1 and A and 5 verify + <eos> erm erm erm erm no 1 3 4 5 6 7
f and n and p verify p > Q n : f <eos> erm erm erm erm yes 1 3 4 5 11 12
2 or f produce <eos> erm erm f 1 2 3 4
& or e or @ produce <eos> erm erm erm @ & 1 3 4 5 5 6
0 verify q <eos> erm erm no 0 1 2 3
s or ' or O produce <eos> erm erm erm O 1 3 4 5 6
F or 0 verify B = <eos> erm erm erm no 1 2 3 5 6
/ and L and X and ' verify B l 2 E <eos> erm erm erm erm erm no 1 3 5 6 7 11 12
i or 7 produce <eos> erm erm 7 i 1 2 3 3 4
B or $ or Y produce <eos> erm erm erm Y B 1 3 4 5 5 6
0 or C verify _ 0 0 <eos> erm erm erm yes 1 2 3 6 7
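
Each row of Test_data.tsv (and of the Verify_Produce_*.tsv splits) carries three tab-separated fields: the input sequence terminated by <eos>, the target sequence (a run of erm ponder tokens followed by the answer, i.e. yes/no for verify prompts or the produced symbols), and a space-separated list of attention-target indices into the input. A minimal sketch for reading a split back in; the load_split helper is illustrative, not part of this PR:

# Illustrative helper (not part of this PR) for parsing one TSV split.
def load_split(path):
    examples = []
    with open(path) as f:
        for line in f:
            ipt, out, attn = line.rstrip('\n').split('\t')
            examples.append({
                'input': ipt.split(' '),      # ends in '<eos>'
                'target': out.split(' '),     # 'erm' ponder tokens, then the answer
                'attention': [int(i) for i in attn.split(' ')],
            })
    return examples

sample = load_split('CommaiMini-^$/Short/Test_data.tsv')
print(sample[3]['input'])   # ['z', 'or', '.', 'or', 'x', 'produce', '<eos>']
print(sample[3]['target'])  # ['erm', 'erm', 'erm', 'x']
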
12,000 changes: 12,000 additions & 0 deletions CommaiMini-^$/Short/Verify_Produce_longer.tsv

Large diffs are not rendered by default.

10,800 changes: 10,800 additions & 0 deletions CommaiMini-^$/Short/Verify_Produce_train.tsv

Large diffs are not rendered by default.

12,000 changes: 12,000 additions & 0 deletions CommaiMini-^$/Short/Verify_Produce_unseen.tsv

Large diffs are not rendered by default.

12,000 changes: 12,000 additions & 0 deletions CommaiMini-^$/Short/Verify_Produce_unseen_longer.tsv

Large diffs are not rendered by default.

1,200 changes: 1,200 additions & 0 deletions CommaiMini-^$/Short/Verify_Produce_validation.tsv

Large diffs are not rendered by default.

88 changes: 88 additions & 0 deletions scripts/dataset_statistics.py
@@ -0,0 +1,88 @@
import numpy as np
import os
import matplotlib.pyplot as plt

mfolder = '../CommaiMini-^$/New_AG'
data_split = os.path.join(mfolder, 'Long')


def ops_count(data, name):
    # Bucket token lists by operator (checked in the order or, not, and;
    # anything without an operator counts as a plain copy) and plot counts.
    or_op = []
    and_op = []
    not_op = []
    copy = []

    for d in data:
        if ('or' in d):
            or_op.append(d)
        elif ('not' in d):
            not_op.append(d)
        elif ('and' in d):
            and_op.append(d)
        else:
            copy.append(d)

    fig, ax = plt.subplots()
    p1 = ax.bar([1, 2, 3, 4], [len(copy), len(or_op), len(and_op), len(not_op)])
    ax.set_xticks([1, 2, 3, 4])
    ax.set_xticklabels(('copy', 'or', 'and', 'not'))
    ax.set_ylabel('Instances per operator')
    ax.set_xlabel('Operators')
    ax.set_title('{} case'.format(name))

    for p in p1:
        height = p.get_height()
        ax.text(p.get_x() + p.get_width() / 2., 0.90 * height,
                '%d' % int(height),
                ha='center', va='bottom')
    # plt.savefig(os.path.join(mfolder, 'Stats', '{}-eps.eps'.format(name)), format='eps')
    plt.show()


def file_stats(fname):
    # Read one split, tokenize its tab-separated fields, plot per-operator
    # counts for the verify and produce subsets, and return the raw strings
    # for the error check below.
    with open(os.path.join(data_split, 'Verify_Produce_{}.tsv'.format(fname)), 'r') as in_file:
        all_lines = in_file.readlines()
    data_arr = np.zeros((len(all_lines), 3), dtype=object)
    err_chk = np.zeros((len(all_lines), 3), dtype=object)
    for idx, line in enumerate(all_lines):
        targets = line.strip('\n').split('\t')
        for i, tgt in enumerate(targets):
            data_arr[idx, i] = tgt.split(' ')
            err_chk[idx, i] = tgt

    verify = []
    produce = []
    for i in range(data_arr.shape[0]):
        if ('verify' in data_arr[i, 0]):
            verify.append(data_arr[i, 0])
        else:
            produce.append(data_arr[i, 0])

    ops_count(verify, '{}_verify'.format(fname))
    ops_count(produce, '{}_produce'.format(fname))

    return err_chk


fnames = ['train', 'validation', 'unseen', 'longer', 'unseen_longer']
operators = ['and', 'or', 'not']

err_report = []
for fname in fnames:
    err_arr = file_stats(fname)
    err = []
    for i in range(err_arr.shape[0]):
        if ('verify' in err_arr[i, 0].split(' ')):
            temp_str = err_arr[i, 0].split('verify')[0]
        else:
            temp_str = err_arr[i, 0].split('produce')[0]

        # Flag malformed prompts: an expression part that ends in a
        # dangling operator or is empty altogether.
        tokens = temp_str.strip().split(' ')
        if (tokens[-1] in operators or tokens[-1] == ''):
            err.append(err_arr[i, 0])
    err_report.append(err)

print(err_report)
236 changes: 236 additions & 0 deletions scripts/ng_produce.py
@@ -0,0 +1,236 @@
import numpy as np
import random
import argparse
import string

mfolder = '../CommaiMini-^$'
try:
    raw_input  # Python 2
except NameError:
    raw_input = input  # Python 3

parser = argparse.ArgumentParser()
parser.add_argument('--max_train_com', type=int, help='max length of compositions in train', default=4)
parser.add_argument('--max_test_com', type=int, help='max length of compositions in test', default=7)


opt = parser.parse_args()

# All printable characters except the six trailing whitespace characters.
alphabets = [string.printable[i] for i in range(len(string.printable) - 6)]
# pidx = int(len(alphabets)/2)
# random.shuffle(alphabets)
# subset1 = alphabets[:pidx]
# subset2 = alphabets[pidx:]
operators = ['and', 'or', 'not']
ponder = 'erm'
eois = '<eos>'


def attn_list(ipt, out_tokens):
    # Build the attention-target indices for an example: two ponder steps
    # attending to the token before 'produce' and to 'produce' itself, one
    # step per output symbol attending to its source position, and a final
    # step attending to <eos>. out_tokens is the list of symbols the
    # decoder must emit.
    temp_attn = []
    pre_opt = []
    ver_idx = ipt.index('produce')
    temp_attn.extend([ver_idx - 1, ver_idx])
    for i in range(len(temp_attn)):
        pre_opt.append(ponder)
    if ('not' not in ipt):
        for o in out_tokens:
            temp_attn.append(ipt.index(o))
    else:
        # Positions of the symbols directly preceded by 'not'.
        not_idx = []
        for i in range(1, len(ipt)):
            if (ipt[i - 1] == 'not'):
                not_idx.append(i)
        for o in out_tokens:
            if (o in ipt):
                temp_attn.append(ipt.index(o))
            else:
                # The output symbol replaced a negated input symbol.
                temp_attn.append(not_idx[0])
                not_idx.pop(0)
    temp_attn.append(ipt.index(eois))
    pre_opt.extend(out_tokens)
    return (temp_attn, pre_opt)


def and_gate(ps, token):
    # 'a and b and c produce' -> emit every symbol, in shuffled order.
    temp_ipt = []  # [token]
    for i, s in enumerate(ps):
        temp_ipt.append(s)
        if (i != len(ps) - 1):
            temp_ipt.append('and')
    random.shuffle(ps)
    temp_ipt.append(token)
    temp_ipt.append(eois)
    temp_attn, pre_opt = attn_list(temp_ipt, ps)
    return (pre_opt, temp_ipt, temp_attn)


def or_gate(ps, token):
    # 'a or b or c produce' -> emit a random non-empty subset of the symbols.
    temp_ipt = []  # [token]
    for i, s in enumerate(ps):
        temp_ipt.append(s)
        if (i != len(ps) - 1):
            temp_ipt.append('or')
    size = random.sample(np.arange(1, len(ps) + 1, dtype=int).tolist(), 1)
    out_str = np.random.choice(ps, size=size, replace=False).tolist()
    temp_ipt.append(token)
    temp_ipt.append(eois)
    temp_attn, pre_opt = attn_list(temp_ipt, out_str)
    return (pre_opt, temp_ipt, temp_attn)


def not_gate(ps, token):
    # 'not a and b produce' -> each negated symbol is replaced in the output
    # by a random symbol from the complementary vocabulary.
    temp_ipt = []
    temp_opt = []
    num_nots = random.sample(np.arange(1, len(ps) + 1, dtype=int).tolist(), 1)[0]
    not_pfx = random.sample(ps, num_nots)
    cvocab = list(set(alphabets) - set(not_pfx))
    # for pf in not_pfx:
    #     temp_alpha = list(set(alphabets) - set([pf]))
    #     cvocab.append(temp_alpha)
    for i, s in enumerate(ps):
        if (s in not_pfx):
            temp_ipt.append('not')
            temp_ipt.append(s)
            if (i != len(ps) - 1):
                temp_ipt.append('and')
            temp_opt.append(random.sample(cvocab, 1)[0])
        else:
            temp_ipt.append(s)
            if (i != len(ps) - 1):
                temp_ipt.append('and')
            temp_opt.append(s)
    temp_ipt.append(token)
    temp_ipt.append(eois)
    temp_attn, pre_opt = attn_list(temp_ipt, temp_opt)
    return (pre_opt, temp_ipt, temp_attn)


def io_strings(word, all_words, comp_len, token):
    # Generate one example per operator for `word`, each with a composition
    # length sampled from comp_len, and return the joined input, output and
    # attention strings.
    ipt = []
    out = []
    attn = []
    comps = np.random.choice(comp_len, size=len(operators))
    operations = np.random.choice(operators, size=len(operators), replace=False).tolist()
    random.shuffle(operations)
    for i in range(len(comps)):
        ps = [word]
        ps.extend(np.random.choice(all_words, size=comps[i] - 1).tolist())
        if (operations[i] == 'and'):
            str_tup = and_gate(ps, token)
        elif (operations[i] == 'or'):
            str_tup = or_gate(ps, token)
        else:
            str_tup = not_gate(ps, token)
        out.append(' '.join(map(str, str_tup[0])))
        ipt.append(' '.join(map(str, str_tup[1])))
        attn.append(' '.join(map(str, str_tup[2])))
    return (ipt, out, attn)


def train(words, size):
    # Training split: compositions of length 2 .. max_train_com over `words`.
    comp_lens = np.arange(2, opt.max_train_com + 1, dtype=int).tolist()
    data = np.zeros((size, 3), dtype=object)
    idx = 0
    try:
        while idx < data.shape[0]:
            random.shuffle(words)
            for w in words:
                tup = io_strings(w, words, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()
    except StopIteration:
        pass

    return data


def unseen(words1, words2, size):
    # Unseen-composition split: each vocabulary subset is composed with the
    # other, so trained pairings never occur.
    comp_lens = np.arange(2, opt.max_train_com + 1, dtype=int).tolist()
    data = np.zeros((size, 3), dtype=object)
    idx = 0
    try:
        while idx < data.shape[0]:
            random.shuffle(words1)
            for w in words1:
                tup = io_strings(w, words2, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()

            random.shuffle(words2)
            for w in words2:
                tup = io_strings(w, words1, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()
    except StopIteration:
        pass

    return data


def longer(words, size):
    # Longer split: compositions of length max_train_com+1 .. max_test_com.
    comp_lens = np.arange(opt.max_train_com + 1, opt.max_test_com + 1, dtype=int).tolist()
    data = np.zeros((size, 3), dtype=object)
    idx = 0
    try:
        while idx < data.shape[0]:
            random.shuffle(words)
            for w in words:
                tup = io_strings(w, words, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()
    except StopIteration:
        pass
    return data


def long_unseen(words1, words2, size):
    # Longer unseen split: longer compositions across the two subsets.
    comp_lens = np.arange(opt.max_train_com + 1, opt.max_test_com + 1, dtype=int).tolist()
    data = np.zeros((size, 3), dtype=object)
    idx = 0
    try:
        while idx < data.shape[0]:
            random.shuffle(words1)
            for w in words1:
                tup = io_strings(w, words2, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()

            random.shuffle(words2)
            for w in words2:
                tup = io_strings(w, words1, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()
    except StopIteration:
        pass

    return data


def get_data(num_samples, subset1, subset2):
    # Assemble all four splits from two disjoint vocabulary subsets.
    tr1 = train(subset1, int(num_samples / 2))
    tr2 = train(subset2, int(num_samples / 2))
    train_data = np.vstack((tr1, tr2))
    unseen_test = unseen(subset1, subset2, int(num_samples / 10))
    lg1 = longer(subset1, int(num_samples / 20))
    lg2 = longer(subset2, int(num_samples / 20))
    longer_test = np.vstack((lg1, lg2))
    unseen_long_test = long_unseen(subset1, subset2, int(num_samples / 10))

    return (train_data, unseen_long_test, unseen_test, longer_test)
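
As committed, ng_produce.py only defines the generation routines; nothing in this diff calls get_data or writes the TSV splits. A hypothetical driver, mirroring the commented-out subset split near the top of the script (the sample count, write_tsv helper, and output paths are illustrative, chosen to resemble the committed file names):

# Hypothetical driver, not part of this PR.
import os

def write_tsv(path, rows):
    # Each generated row is (input, output, attention) as three strings.
    with open(path, 'w') as f:
        for row in rows:
            if 0 in row:  # trailing rows may be left unfilled by the generators
                continue
            f.write('\t'.join(row) + '\n')

random.shuffle(alphabets)
pidx = int(len(alphabets) / 2)
subset1, subset2 = alphabets[:pidx], alphabets[pidx:]

train_data, unseen_long, unseen_t, longer_t = get_data(10800, subset1, subset2)
out_dir = os.path.join(mfolder, 'Short')
write_tsv(os.path.join(out_dir, 'Verify_Produce_train.tsv'), train_data)
write_tsv(os.path.join(out_dir, 'Verify_Produce_unseen.tsv'), unseen_t)
write_tsv(os.path.join(out_dir, 'Verify_Produce_longer.tsv'), longer_t)
write_tsv(os.path.join(out_dir, 'Verify_Produce_unseen_longer.tsv'), unseen_long)
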
