120,000 changes: 120,000 additions & 0 deletions CommaiMini-^$/Long/Verify_Produce_longer.tsv

Large diffs are not rendered by default.

108,000 changes: 108,000 additions & 0 deletions CommaiMini-^$/Long/Verify_Produce_train.tsv

Large diffs are not rendered by default.

120,000 changes: 120,000 additions & 0 deletions CommaiMini-^$/Long/Verify_Produce_unseen.tsv

Large diffs are not rendered by default.

120,000 changes: 120,000 additions & 0 deletions CommaiMini-^$/Long/Verify_Produce_unseen_longer.tsv

Large diffs are not rendered by default.

12,000 changes: 12,000 additions & 0 deletions CommaiMini-^$/Long/Verify_Produce_validation.tsv

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions CommaiMini-^$/Short/Test_data.tsv
@@ -0,0 +1,25 @@
not o and not 7 produce <eos> erm erm O i 2 4 5 5 6
e and ; verify U ] f ; / e : <eos> erm erm erm yes 1 2 3 10 11
not ! and $ and L verify Q c K [ p ! <eos> erm erm erm erm no 2 4 5 6 12 13
z or . or x produce <eos> erm erm erm x 1 3 4 5 6
N and j verify n ) ` ( j N Y <eos> erm erm erm yes 1 2 3 10 11
y or r verify O <eos> erm erm erm no 1 2 3 4 5
- and J and Q verify Q F 0 - J j n Q <eos> erm erm erm erm yes 1 3 4 5 13 14
T and > and a and not @ produce <eos> erm erm erm erm T > a s 1 3 5 7 8 8 8 8 9
not . and b produce <eos> erm erm h b 2 3 4 4 5
h and A produce <eos> erm erm h A 1 2 3 3 4
not 8 and r produce <eos> erm erm n r 2 3 4 4 5
E and not v and k and p produce <eos> erm erm erm erm E z k p 1 4 6 7 8 8 8 8 9
U and not r verify I r <eos> erm erm erm no 1 3 4 6 7
T and G and w produce <eos> erm erm erm T G w 1 3 4 5 5 5 6
1 and A and 5 verify + <eos> erm erm erm erm no 1 3 4 5 6 7
f and n and p verify p > Q n : f <eos> erm erm erm erm yes 1 3 4 5 11 12
2 or f produce <eos> erm erm f 1 2 3 4
& or e or @ produce <eos> erm erm erm @ & 1 3 4 5 5 6
0 verify q <eos> erm erm no 0 1 2 3
s or ' or O produce <eos> erm erm erm O 1 3 4 5 6
F or 0 verify B = <eos> erm erm erm no 1 2 3 5 6
/ and L and X and ' verify B l 2 E <eos> erm erm erm erm erm no 1 3 5 6 7 11 12
i or 7 produce <eos> erm erm 7 i 1 2 3 3 4
B or $ or Y produce <eos> erm erm erm Y B 1 3 4 5 5 6
0 or C verify _ 0 0 <eos> erm erm erm yes 1 2 3 6 7
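
Each row of Test_data.tsv (and of the Verify_Produce_*.tsv splits) carries three tab-separated fields: the input sequence terminated by <eos>, the target sequence (a run of erm ponder tokens followed by the answer, i.e. yes/no for verify prompts or the produced symbols), and a space-separated list of attention-target indices into the input. A minimal sketch for reading a split back in; the load_split helper is illustrative, not part of this PR:

# Illustrative helper (not part of this PR) for parsing one TSV split.
def load_split(path):
    examples = []
    with open(path) as f:
        for line in f:
            ipt, out, attn = line.rstrip('\n').split('\t')
            examples.append({
                'input': ipt.split(' '),      # ends in '<eos>'
                'target': out.split(' '),     # 'erm' ponder tokens, then the answer
                'attention': [int(i) for i in attn.split(' ')],
            })
    return examples

sample = load_split('CommaiMini-^$/Short/Test_data.tsv')
print(sample[3]['input'])   # ['z', 'or', '.', 'or', 'x', 'produce', '<eos>']
print(sample[3]['target'])  # ['erm', 'erm', 'erm', 'x']
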
12,000 changes: 12,000 additions & 0 deletions CommaiMini-^$/Short/Verify_Produce_longer.tsv

Large diffs are not rendered by default.

10,800 changes: 10,800 additions & 0 deletions CommaiMini-^$/Short/Verify_Produce_train.tsv

Large diffs are not rendered by default.

12,000 changes: 12,000 additions & 0 deletions CommaiMini-^$/Short/Verify_Produce_unseen.tsv

Large diffs are not rendered by default.

12,000 changes: 12,000 additions & 0 deletions CommaiMini-^$/Short/Verify_Produce_unseen_longer.tsv

Large diffs are not rendered by default.

1,200 changes: 1,200 additions & 0 deletions CommaiMini-^$/Short/Verify_Produce_validation.tsv

Large diffs are not rendered by default.

88 changes: 88 additions & 0 deletions scripts/dataset_statistics.py
@@ -0,0 +1,88 @@
import numpy as np
import os
import matplotlib.pyplot as plt

mfolder = '../CommaiMini-^$/New_AG'
data_split = os.path.join(mfolder, 'Long')


def ops_count(data, name):
    # Bucket token lists by operator (checked in the order or, not, and;
    # anything without an operator counts as a plain copy) and plot counts.
    or_op = []
    and_op = []
    not_op = []
    copy = []

    for d in data:
        if ('or' in d):
            or_op.append(d)
        elif ('not' in d):
            not_op.append(d)
        elif ('and' in d):
            and_op.append(d)
        else:
            copy.append(d)

    fig, ax = plt.subplots()
    p1 = ax.bar([1, 2, 3, 4], [len(copy), len(or_op), len(and_op), len(not_op)])
    ax.set_xticks([1, 2, 3, 4])
    ax.set_xticklabels(('copy', 'or', 'and', 'not'))
    ax.set_ylabel('Instances per operator')
    ax.set_xlabel('Operators')
    ax.set_title('{} case'.format(name))

    for p in p1:
        height = p.get_height()
        ax.text(p.get_x() + p.get_width() / 2., 0.90 * height,
                '%d' % int(height),
                ha='center', va='bottom')
    # plt.savefig(os.path.join(mfolder, 'Stats', '{}-eps.eps'.format(name)), format='eps')
    plt.show()


def file_stats(fname):
    # Read one split, tokenize its tab-separated fields, plot per-operator
    # counts for the verify and produce subsets, and return the raw strings
    # for the error check below.
    with open(os.path.join(data_split, 'Verify_Produce_{}.tsv'.format(fname)), 'r') as in_file:
        all_lines = in_file.readlines()
    data_arr = np.zeros((len(all_lines), 3), dtype=object)
    err_chk = np.zeros((len(all_lines), 3), dtype=object)
    for idx, line in enumerate(all_lines):
        targets = line.strip('\n').split('\t')
        for i, tgt in enumerate(targets):
            data_arr[idx, i] = tgt.split(' ')
            err_chk[idx, i] = tgt

    verify = []
    produce = []
    for i in range(data_arr.shape[0]):
        if ('verify' in data_arr[i, 0]):
            verify.append(data_arr[i, 0])
        else:
            produce.append(data_arr[i, 0])

    ops_count(verify, '{}_verify'.format(fname))
    ops_count(produce, '{}_produce'.format(fname))

    return err_chk


fnames = ['train', 'validation', 'unseen', 'longer', 'unseen_longer']
operators = ['and', 'or', 'not']

err_report = []
for fname in fnames:
    err_arr = file_stats(fname)
    err = []
    for i in range(err_arr.shape[0]):
        if ('verify' in err_arr[i, 0].split(' ')):
            temp_str = err_arr[i, 0].split('verify')[0]
        else:
            temp_str = err_arr[i, 0].split('produce')[0]

        # Flag malformed prompts: an expression part that ends in a
        # dangling operator or is empty altogether.
        tokens = temp_str.strip().split(' ')
        if (tokens[-1] in operators or tokens[-1] == ''):
            err.append(err_arr[i, 0])
    err_report.append(err)

print(err_report)
236 changes: 236 additions & 0 deletions scripts/ng_produce.py
@@ -0,0 +1,236 @@
import numpy as np
import random
import argparse
import string

mfolder = '../CommaiMini-^$'
try:
    raw_input  # Python 2
except NameError:
    raw_input = input  # Python 3

parser = argparse.ArgumentParser()
parser.add_argument('--max_train_com', type=int, help='max length of compositions in train', default=4)
parser.add_argument('--max_test_com', type=int, help='max length of compositions in test', default=7)


opt = parser.parse_args()

# All printable characters except the six trailing whitespace characters.
alphabets = [string.printable[i] for i in range(len(string.printable) - 6)]
# pidx = int(len(alphabets)/2)
# random.shuffle(alphabets)
# subset1 = alphabets[:pidx]
# subset2 = alphabets[pidx:]
operators = ['and', 'or', 'not']
ponder = 'erm'
eois = '<eos>'


def attn_list(ipt, out_tokens):
    # Build the attention-target indices for an example: two ponder steps
    # attending to the token before 'produce' and to 'produce' itself, one
    # step per output symbol attending to its source position, and a final
    # step attending to <eos>. out_tokens is the list of symbols the
    # decoder must emit.
    temp_attn = []
    pre_opt = []
    ver_idx = ipt.index('produce')
    temp_attn.extend([ver_idx - 1, ver_idx])
    for i in range(len(temp_attn)):
        pre_opt.append(ponder)
    if ('not' not in ipt):
        for o in out_tokens:
            temp_attn.append(ipt.index(o))
    else:
        # Positions of the symbols directly preceded by 'not'.
        not_idx = []
        for i in range(1, len(ipt)):
            if (ipt[i - 1] == 'not'):
                not_idx.append(i)
        for o in out_tokens:
            if (o in ipt):
                temp_attn.append(ipt.index(o))
            else:
                # The output symbol replaced a negated input symbol.
                temp_attn.append(not_idx[0])
                not_idx.pop(0)
    temp_attn.append(ipt.index(eois))
    pre_opt.extend(out_tokens)
    return (temp_attn, pre_opt)


def and_gate(ps, token):
    # 'a and b and c produce' -> emit every symbol, in shuffled order.
    temp_ipt = []  # [token]
    for i, s in enumerate(ps):
        temp_ipt.append(s)
        if (i != len(ps) - 1):
            temp_ipt.append('and')
    random.shuffle(ps)
    temp_ipt.append(token)
    temp_ipt.append(eois)
    temp_attn, pre_opt = attn_list(temp_ipt, ps)
    return (pre_opt, temp_ipt, temp_attn)


def or_gate(ps, token):
    # 'a or b or c produce' -> emit a random non-empty subset of the symbols.
    temp_ipt = []  # [token]
    for i, s in enumerate(ps):
        temp_ipt.append(s)
        if (i != len(ps) - 1):
            temp_ipt.append('or')
    size = random.sample(np.arange(1, len(ps) + 1, dtype=int).tolist(), 1)
    out_str = np.random.choice(ps, size=size, replace=False).tolist()
    temp_ipt.append(token)
    temp_ipt.append(eois)
    temp_attn, pre_opt = attn_list(temp_ipt, out_str)
    return (pre_opt, temp_ipt, temp_attn)


def not_gate(ps, token):
    # 'not a and b produce' -> each negated symbol is replaced in the output
    # by a random symbol from the complementary vocabulary.
    temp_ipt = []
    temp_opt = []
    num_nots = random.sample(np.arange(1, len(ps) + 1, dtype=int).tolist(), 1)[0]
    not_pfx = random.sample(ps, num_nots)
    cvocab = list(set(alphabets) - set(not_pfx))
    # for pf in not_pfx:
    #     temp_alpha = list(set(alphabets) - set([pf]))
    #     cvocab.append(temp_alpha)
    for i, s in enumerate(ps):
        if (s in not_pfx):
            temp_ipt.append('not')
            temp_ipt.append(s)
            if (i != len(ps) - 1):
                temp_ipt.append('and')
            temp_opt.append(random.sample(cvocab, 1)[0])
        else:
            temp_ipt.append(s)
            if (i != len(ps) - 1):
                temp_ipt.append('and')
            temp_opt.append(s)
    temp_ipt.append(token)
    temp_ipt.append(eois)
    temp_attn, pre_opt = attn_list(temp_ipt, temp_opt)
    return (pre_opt, temp_ipt, temp_attn)


def io_strings(word, all_words, comp_len, token):
    # Generate one example per operator for `word`, each with a composition
    # length sampled from comp_len, and return the joined input, output and
    # attention strings.
    ipt = []
    out = []
    attn = []
    comps = np.random.choice(comp_len, size=len(operators))
    operations = np.random.choice(operators, size=len(operators), replace=False).tolist()
    random.shuffle(operations)
    for i in range(len(comps)):
        ps = [word]
        ps.extend(np.random.choice(all_words, size=comps[i] - 1).tolist())
        if (operations[i] == 'and'):
            str_tup = and_gate(ps, token)
        elif (operations[i] == 'or'):
            str_tup = or_gate(ps, token)
        else:
            str_tup = not_gate(ps, token)
        out.append(' '.join(map(str, str_tup[0])))
        ipt.append(' '.join(map(str, str_tup[1])))
        attn.append(' '.join(map(str, str_tup[2])))
    return (ipt, out, attn)


def train(words, size):
    # Training split: compositions of length 2 .. max_train_com over `words`.
    comp_lens = np.arange(2, opt.max_train_com + 1, dtype=int).tolist()
    data = np.zeros((size, 3), dtype=object)
    idx = 0
    try:
        while idx < data.shape[0]:
            random.shuffle(words)
            for w in words:
                tup = io_strings(w, words, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()
    except StopIteration:
        pass

    return data


def unseen(words1, words2, size):
    # Unseen-composition split: each vocabulary subset is composed with the
    # other, so trained pairings never occur.
    comp_lens = np.arange(2, opt.max_train_com + 1, dtype=int).tolist()
    data = np.zeros((size, 3), dtype=object)
    idx = 0
    try:
        while idx < data.shape[0]:
            random.shuffle(words1)
            for w in words1:
                tup = io_strings(w, words2, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()

            random.shuffle(words2)
            for w in words2:
                tup = io_strings(w, words1, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()
    except StopIteration:
        pass

    return data


def longer(words, size):
    # Longer split: compositions of length max_train_com+1 .. max_test_com.
    comp_lens = np.arange(opt.max_train_com + 1, opt.max_test_com + 1, dtype=int).tolist()
    data = np.zeros((size, 3), dtype=object)
    idx = 0
    try:
        while idx < data.shape[0]:
            random.shuffle(words)
            for w in words:
                tup = io_strings(w, words, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()
    except StopIteration:
        pass
    return data


def long_unseen(words1, words2, size):
    # Longer unseen split: longer compositions across the two subsets.
    comp_lens = np.arange(opt.max_train_com + 1, opt.max_test_com + 1, dtype=int).tolist()
    data = np.zeros((size, 3), dtype=object)
    idx = 0
    try:
        while idx < data.shape[0]:
            random.shuffle(words1)
            for w in words1:
                tup = io_strings(w, words2, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()

            random.shuffle(words2)
            for w in words2:
                tup = io_strings(w, words1, comp_lens, 'produce')
                data[idx:idx + len(tup[0]), 0] = tup[0]
                data[idx:idx + len(tup[0]), 1] = tup[1]
                data[idx:idx + len(tup[0]), 2] = tup[2]
                idx += len(tup[0])
                if (idx > data.shape[0] - len(operators)):
                    raise StopIteration()
    except StopIteration:
        pass

    return data


def get_data(num_samples, subset1, subset2):
    # Assemble all four splits from two disjoint vocabulary subsets.
    tr1 = train(subset1, int(num_samples / 2))
    tr2 = train(subset2, int(num_samples / 2))
    train_data = np.vstack((tr1, tr2))
    unseen_test = unseen(subset1, subset2, int(num_samples / 10))
    lg1 = longer(subset1, int(num_samples / 20))
    lg2 = longer(subset2, int(num_samples / 20))
    longer_test = np.vstack((lg1, lg2))
    unseen_long_test = long_unseen(subset1, subset2, int(num_samples / 10))

    return (train_data, unseen_long_test, unseen_test, longer_test)
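
As committed, ng_produce.py only defines the generation routines; nothing in this diff calls get_data or writes the TSV splits. A hypothetical driver, mirroring the commented-out subset split near the top of the script (the sample count, write_tsv helper, and output paths are illustrative, chosen to resemble the committed file names):

# Hypothetical driver, not part of this PR.
import os

def write_tsv(path, rows):
    # Each generated row is (input, output, attention) as three strings.
    with open(path, 'w') as f:
        for row in rows:
            if 0 in row:  # trailing rows may be left unfilled by the generators
                continue
            f.write('\t'.join(row) + '\n')

random.shuffle(alphabets)
pidx = int(len(alphabets) / 2)
subset1, subset2 = alphabets[:pidx], alphabets[pidx:]

train_data, unseen_long, unseen_t, longer_t = get_data(10800, subset1, subset2)
out_dir = os.path.join(mfolder, 'Short')
write_tsv(os.path.join(out_dir, 'Verify_Produce_train.tsv'), train_data)
write_tsv(os.path.join(out_dir, 'Verify_Produce_unseen.tsv'), unseen_t)
write_tsv(os.path.join(out_dir, 'Verify_Produce_longer.tsv'), longer_t)
write_tsv(os.path.join(out_dir, 'Verify_Produce_unseen_longer.tsv'), unseen_long)
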
