diff --git a/script/dfs_traversal.py b/script/dfs_traversal.py index 96ef25b9..9e74aa18 100644 --- a/script/dfs_traversal.py +++ b/script/dfs_traversal.py @@ -5,8 +5,8 @@ import subprocess import os -path = 'resources/data/python/final/jsonl/valid_old/python_valid_0.jsonl.gz' -s_path = 'resources/data/python/final/jsonl/valid/python_valid_0_updated.jsonl.gz' +path = '../resources/data/python/final/jsonl/valid_old/python_valid_0.jsonl.gz' +s_path = '../resources/data/python/final/jsonl/valid/python_valid_0_updated.jsonl.gz' a = RichPath.create(path) s = RichPath.create(s_path) @@ -14,6 +14,7 @@ print('started') b = list(a.read_as_jsonl()) +c=[] count = 0 def convert_code_to_tokens(code): @@ -48,7 +49,7 @@ def convert_code_to_tokens(code): for idx, sample in enumerate(b): print("sample {} in progress".format(idx)) # print(sample['code']) - if idx==3282: + if idx==5306: print(sample['code']) tokenization = convert_code_to_tokens(sample['code']) @@ -56,12 +57,13 @@ def convert_code_to_tokens(code): templist.append(idx) else: b[idx]['code_tokens'] = tokenization + c.append(b[idx]) # tree = my_ast.parse(sample['code']) # an = SourceGenerator(' ') # an.visit(tree) # b[idx]['code_tokens'] = an.result -s.save_as_compressed_file(b) +s.save_as_compressed_file(c) print('finished', templist, len(templist), tokenization) diff --git a/script/dfs_traversal_2.py b/script/dfs_traversal_2.py index 97a56e45..d250e102 100644 --- a/script/dfs_traversal_2.py +++ b/script/dfs_traversal_2.py @@ -1,7 +1,4 @@ -from yapf.yapflib import pytree_utils - # from src.dpu_utils.utils import RichPath -from src.utils.my_utils import DotDict from src.utils import my_ast from src.utils.codegen2 import * import json_lines diff --git a/script/parent_node_pairs.py b/script/parent_node_pairs.py new file mode 100644 index 00000000..96e447d0 --- /dev/null +++ b/script/parent_node_pairs.py @@ -0,0 +1,115 @@ + +from dpu_utils.utils import RichPath +from src.utils import my_ast +from src.utils.codegen import * +import subprocess + +from .parent_node_parse_helpers import dfs_traversal_with_parents +import pandas as pd +import os + + + +count = 0 +def convert_code_to_tokens(code): + global count + tree ='' + # tree = my_ast.parse(code) + + try: + tree = my_ast.parse(code) + except: + try: + f = open('temp.py', 'w+') + f.write(code) + f.close() + subprocess.run(['2to3', '-w', 'temp.py']) + f = open('temp.py', 'r') + code = f.read() + # print(code) + tree = my_ast.parse(code) + # os.rmdir('temp.py') + except: + pass + if tree!='' and tree != None: + return dfs_traversal_with_parents(tree) + else: + return [], [] +# + + +from pprint import pprint +if __name__=='__main__': + print('something') + + #[26045, 28475] + + path = 'resources/data/python/final/jsonl/old_train/python_train_0.jsonl.gz' + s_path = 'resources/data/python/final/jsonl/train/python_train_0_dfs_parent.jsonl.gz' + + a = RichPath.create(path) + s = RichPath.create(s_path) + + print('started') + b = list(a.read_as_jsonl()) + + b = sorted(b, key=lambda v: len(v['code_tokens'])) + templist = [] + c = [] + for idx, sample in enumerate(b): + print("sample {} in progress".format(idx)) + # print(sample['code']) + + if idx == 19 or sample['sha']=='618d6bff71073c8c93501ab7392c3cc579730f0b': + print(sample['code']) + + dfs, parent_dfs = convert_code_to_tokens(sample['code']) + if dfs == [] or parent_dfs==[]: + templist.append(idx) + else: + b[idx]['code_tokens'] = dfs + b[idx]['parent_dfs'] = parent_dfs + c.append(b[idx]) + + s.save_as_compressed_file(c) + # df = pd.DataFrame([dfs, 
parent_dfs]) + # print(parent_dfs) + print('finished', templist, len(templist), len(c)) + + + # code= '''def f(a, b=1, c=2, *d, e, f=3, **g): + # pass''' + # + # code = b[2]['code'] + # print(code) + # code = '''ip = socket.gethostbyname(host)''' + # + # code = '''ip = socket.gethostbyname(host)\n[ port , request_size , num_requests , num_conns ] = map ( + # string .atoi , sys . argv [2:] + # )\nchain = build_request_chain ( num_requests , host , request_size )''' + + # code = '''from foo import bar as b, car as c, dar as d''' + # print(convert_code_to_tokens(code)) + +# code ='''print('something') +# try: +# a+1 +# except IOError: +# return 1 +# else: +# a+2 +# finally: +# return 2''' + + + + +# # code = '''func(a, b=c, *d, **e)''' +# # a, b = parse_file_with_parents(code) +# # df = pd.DataFrame([a, b]) +# # print(df.T) +# +# result_tree = parse_file_with_parents(code) + # # + # # # print(pd.read_json(result_tree)) + # pprint(result_tree) diff --git a/script/parent_node_parse_helpers.py b/script/parent_node_parse_helpers.py new file mode 100644 index 00000000..09bcb00f --- /dev/null +++ b/script/parent_node_parse_helpers.py @@ -0,0 +1,337 @@ + +unicode = lambda s: str(s) +import ast +from pprint import pprint +import pandas as pd + +def create_tree_without_parents(code): + global c, d + tree = ast.parse(code) + + json_tree = [] + + def gen_identifier(identifier, node_type='identifier'): + pos = len(json_tree) + json_node = {} + json_tree.append(json_node) + json_node['type'] = node_type + json_node['value'] = identifier + return pos + + def traverse_list(l, node_type='list'): + pos = len(json_tree) + json_node = {} + json_tree.append(json_node) + json_node['type'] = node_type + children = [] + for item in l: + children.append(traverse(item)) + if (len(children) != 0): + json_node['children'] = children + return pos + + def traverse(node): + pos = len(json_tree) + json_node = {} + json_tree.append(json_node) + json_node['type'] = type(node).__name__ + children = [] + if isinstance(node, ast.Name): + json_node['value'] = node.id + elif isinstance(node, ast.Num): + json_node['value'] = unicode(node.n) + elif isinstance(node, ast.Str): + json_node['value'] = node.s + elif isinstance(node, ast.alias): + json_node['value'] = unicode(node.name) + if node.asname: + children.append(gen_identifier(node.asname)) + elif isinstance(node, ast.FunctionDef): + json_node['value'] = unicode(node.name) + elif isinstance(node, ast.ClassDef): + json_node['value'] = unicode(node.name) + elif isinstance(node, ast.ImportFrom): + if node.module: + json_node['value'] = unicode(node.module) + elif isinstance(node, ast.Global): + for n in node.names: + children.append(gen_identifier(n)) + elif isinstance(node, ast.keyword): + json_node['value'] = unicode(node.arg) + + # Process children. 
+ if isinstance(node, ast.For): + children.append(traverse(node.target)) + children.append(traverse(node.iter)) + children.append(traverse_list(node.body, 'body')) + if node.orelse: + children.append(traverse_list(node.orelse, 'orelse')) + elif isinstance(node, ast.If) or isinstance(node, ast.While): + children.append(traverse(node.test)) + children.append(traverse_list(node.body, 'body')) + if node.orelse: + children.append(traverse_list(node.orelse, 'orelse')) + elif isinstance(node, ast.With): + children.append(traverse(node.context_expr)) + if node.optional_vars: + children.append(traverse(node.optional_vars)) + children.append(traverse_list(node.body, 'body')) + elif isinstance(node, ast.Try): + children.append(traverse_list(node.body, 'body')) + children.append(traverse_list(node.handlers, 'handlers')) + if node.orelse: + children.append(traverse_list(node.orelse, 'orelse')) + if node.finalbody: + children.append(traverse_list(node.finalbody, 'finalbody')) + elif isinstance(node, ast.arguments): + children.append(traverse_list(node.args, 'args')) + children.append(traverse_list(node.defaults, 'defaults')) + if node.vararg: + children.append(gen_identifier(node.vararg, 'vararg')) + if node.kwarg: + children.append(gen_identifier(node.kwarg, 'kwarg')) + elif isinstance(node, ast.ExceptHandler): + if node.type: + children.append(traverse_list([node.type], 'type')) + if node.name: + children.append(traverse_list([node.name], 'name')) + children.append(traverse_list(node.body, 'body')) + elif isinstance(node, ast.ClassDef): + children.append(traverse_list(node.bases, 'bases')) + children.append(traverse_list(node.body, 'body')) + children.append(traverse_list(node.decorator_list, 'decorator_list')) + elif isinstance(node, ast.FunctionDef): + children.append(traverse(node.args)) + children.append(traverse_list(node.body, 'body')) + children.append(traverse_list(node.decorator_list, 'decorator_list')) + else: + # Default handling: iterate over children. + for child in ast.iter_child_nodes(node): + if isinstance(child, ast.expr_context) or isinstance(child, ast.operator) or isinstance(child, + ast.boolop) or isinstance( + child, ast.unaryop) or isinstance(child, ast.cmpop): + # Directly include expr_context, and operators into the type instead of creating a child. + json_node['type'] = json_node['type'] + type(child).__name__ + else: + children.append(traverse(child)) + + if isinstance(node, ast.Attribute): + children.append(gen_identifier(node.attr, 'attr')) + + if (len(children) != 0): + json_node['children'] = children + return pos + + traverse(tree) + return json_tree + + +def get_docstring(node, clean=True): + """ + Return the docstring for the given node or None if no docstring can + be found. If the node provided does not have docstrings a TypeError + will be raised. + + If *clean* is `True`, all tabs are expanded to spaces and any whitespace + that can be uniformly removed from the second line onwards is removed. 
+ """ + if not isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.Module)): + raise TypeError("%r can't have docstrings" % node.__class__.__name__) + if not(node.body and isinstance(node.body[0], ast.Expr)): + return None + node = node.body[0].value + if isinstance(node, ast.Str): + text = node.s + # elif isinstance(node, Constant) and isinstance(node.value, str): + # text = node.value + else: + return None + if clean: + import inspect + text = inspect.cleandoc(text) + return text + + +def dfs_traversal_with_parents(tree): + global c, d + + docstring = '' + json_tree = [] + + def gen_identifier(identifier, node_type='identifier', parent=None): + global docstring + pos = len(json_tree) + json_node = {} + json_tree.append(json_node) + json_node['type'] = node_type + json_node['value'] = identifier + + if parent: + if hasattr(parent, 'ctx'): + json_node['parent'] = type(parent).__name__+ type(parent.ctx).__name__ + else: + json_node['parent'] = type(parent).__name__ + else: + json_node['parent'] = None + return pos + + def traverse_list(l, node_type='list', parent=None): + pos = len(json_tree) + json_node = {} + json_tree.append(json_node) + json_node['type'] = node_type + if parent: + if hasattr(parent, 'ctx'): + json_node['parent'] = type(parent).__name__ + type(parent.ctx).__name__ + else: + json_node['parent'] = type(parent).__name__ + else: + json_node['parent'] = None + children = [] + for item in l: + if item: + children.append(traverse(item, node_type)) + if (len(children) != 0): + json_node['children'] = children + return pos + + def traverse(node, parent=None): + global docstring + pos = len(json_tree) + if not (isinstance(node, ast.Str) and docstring == node.s): + json_node = {} + json_tree.append(json_node) + json_node['type'] = type(node).__name__ + if parent: + if type(parent) == str: + json_node['parent'] = parent + elif hasattr(parent, 'ctx'): + json_node['parent'] = type(parent).__name__ + type(parent.ctx).__name__ + else: + json_node['parent'] = type(parent).__name__ + else: + json_node['parent'] = None + children = [] + if isinstance(node, ast.Name): + json_node['value'] = node.id + elif isinstance(node, ast.Num): + json_node['value'] = unicode(node.n) + elif isinstance(node, ast.Str): + if docstring != node.s: + json_node['value'] = node.s + elif isinstance(node, ast.alias): + json_node['value'] = unicode(node.name) + if node.asname: + json_node['value'] = unicode(node.name) + " as " + str(node.asname) + # children.append(gen_identifier(node.asname, 'asname', node)) + elif isinstance(node, ast.FunctionDef): + docstring = get_docstring(node, clean=False) + json_node['value'] = unicode(node.name) + elif isinstance(node, ast.ClassDef): + json_node['value'] = unicode(node.name) + elif isinstance(node, ast.ImportFrom): + if node.module: + json_node['value'] = unicode(node.module) + # if node.names: + # children.append(traverse_list(node.names, 'imports', node)) + + elif isinstance(node, ast.Global): + for n in node.names: + children.append(gen_identifier(n, 'name', node)) + elif isinstance(node, ast.keyword): + json_node['value'] = unicode(node.arg) + elif isinstance(node, ast.arg): + json_node['value'] = unicode(node.arg) + + # Process children. 
+ if isinstance(node, ast.For): + children.append(traverse(node.target, node)) + children.append(traverse(node.iter, node)) + children.append(traverse_list(node.body, 'body', node)) + if node.orelse: + children.append(traverse_list(node.orelse, 'orelse', node)) + elif isinstance(node, ast.If) or isinstance(node, ast.While): + children.append(traverse(node.test, node)) + children.append(traverse_list(node.body, 'body', node)) + if node.orelse: + children.append(traverse_list(node.orelse, 'orelse', node)) + elif isinstance(node, ast.With): + for item in node.items: + children.append(traverse(item.context_expr, node)) + if item.optional_vars: + children.append(traverse(item.optional_vars, node)) + children.append(traverse_list(node.body, 'body', node)) + elif isinstance(node, ast.Try): + children.append(traverse_list(node.body, 'body', node)) + children.append(traverse_list(node.handlers, 'handlers', node)) + if node.orelse: + children.append(traverse_list(node.orelse, 'orelse', node)) + if node.finalbody: + children.append(traverse_list(node.finalbody, 'finalbody', node)) + elif isinstance(node, ast.arguments): + if node.args: + children.append(traverse_list(node.args, 'args', node)) + if node.defaults: + children.append(traverse_list(node.defaults, 'defaults', node)) + if node.vararg: + children.append(gen_identifier(node.vararg.arg, 'vararg', node)) + if node.kwarg: + children.append(gen_identifier(node.kwarg.arg, 'kwarg', node)) + if node.kwonlyargs: + children.append(traverse_list(node.kwonlyargs, 'kwonlyargs', node)) + if node.kw_defaults: + children.append(traverse_list(node.kw_defaults, 'kw_defaults', node)) + + elif isinstance(node, ast.ExceptHandler): + if node.type: + children.append(traverse(node.type)) + # if node.name: + # children.append(traverse(node.name)) + children.append(traverse_list(node.body, 'body', node)) + elif isinstance(node, ast.ClassDef): + children.append(traverse_list(node.bases, 'bases', node)) + children.append(traverse_list(node.body, 'body', node)) + children.append(traverse_list(node.decorator_list, 'decorator_list', node)) + elif isinstance(node, ast.FunctionDef): + children.append(traverse(node.args, node)) + children.append(traverse_list(node.body, 'body', node)) + if node.decorator_list: + children.append(traverse_list(node.decorator_list, 'decorator_list', node)) + else: + # Default handling: iterate over children. + for child in ast.iter_child_nodes(node): + if isinstance(child, ast.expr_context) or isinstance(child, ast.operator) or isinstance(child, + ast.boolop) or isinstance( + child, ast.unaryop) or isinstance(child, ast.cmpop): + # Directly include expr_context, and operators into the type instead of creating a child. 
+ json_node['type'] = json_node['type'] + type(child).__name__ + else: + children.append(traverse(child, node)) + + if isinstance(node, ast.Attribute): + children.append(gen_identifier(node.attr, 'attribute', node)) + + if (len(children) != 0): + json_node['children'] = children + return pos + + traverse(tree) + + dfs_list = [] + parent_dfs = [] + for node in json_tree: + parent_dfs.append(node['parent']) + dfs_list.append(node['type']) + value = node.get('value', None) + if value: + dfs_list.append(value) + parent_dfs.append(node['type']) + + # df = pd.DataFrame([dfs_list, parent_dfs]) + # print(df.T) + + # pprint(json_tree) + + return dfs_list, parent_dfs + # return json_tree + diff --git a/src/encoders/masked_seq_encoder.py b/src/encoders/masked_seq_encoder.py index 0d696e6b..66f8e5e0 100755 --- a/src/encoders/masked_seq_encoder.py +++ b/src/encoders/masked_seq_encoder.py @@ -29,12 +29,19 @@ def _make_placeholders(self): shape=[None, self.get_hyper('max_num_tokens')], name='tokens_mask') - def init_minibatch(self, batch_data: Dict[str, Any]) -> None: + def init_minibatch(self, batch_data: Dict[str, Any], code=True) -> None: super().init_minibatch(batch_data) batch_data['tokens'] = [] batch_data['tokens_mask'] = [] + if self.hyperparameters['use_parent'] and code: + batch_data['parent_tokens'] = [] + batch_data['parent_tokens_mask'] = [] def minibatch_to_feed_dict(self, batch_data: Dict[str, Any], feed_dict: Dict[tf.Tensor, Any], is_train: bool) -> None: super().minibatch_to_feed_dict(batch_data, feed_dict, is_train) write_to_feed_dict(feed_dict, self.placeholders['tokens'], batch_data['tokens']) write_to_feed_dict(feed_dict, self.placeholders['tokens_mask'], batch_data['tokens_mask']) + + if self.hyperparameters['use_parent'] and batch_data.get('parent_tokens', None): + write_to_feed_dict(feed_dict, self.placeholders['parent_tokens'], batch_data['parent_tokens']) + write_to_feed_dict(feed_dict, self.placeholders['parent_tokens_mask'], batch_data['parent_tokens_mask']) diff --git a/src/encoders/self_att_encoder.py b/src/encoders/self_att_encoder.py index 363dc1b0..338742ce 100755 --- a/src/encoders/self_att_encoder.py +++ b/src/encoders/self_att_encoder.py @@ -15,7 +15,7 @@ def get_default_hyperparameters(cls) -> Dict[str, Any]: 'self_attention_intermediate_size': 512, 'self_attention_num_layers': 3, 'self_attention_num_heads': 8, - 'self_attention_pool_mode': 'weighted_mean', + 'self_attention_pool_mode': 'weighted_mean' } hypers = super().get_default_hyperparameters() hypers.update(encoder_hypers) @@ -32,6 +32,19 @@ def make_model(self, is_train: bool = False) -> tf.Tensor: with tf.variable_scope("self_attention_encoder"): self._make_placeholders() + if self.label == "code" and self.hyperparameters['use_parent']: + print("USING PARENT NODE CONNECTIONS") + self.placeholders['parent_tokens'] = tf.placeholder(tf.int32, + shape=[None, self.get_hyper('max_num_tokens')], + name='parent_tokens') + + self.placeholders['parent_tokens_mask'] = tf.placeholder(tf.int32, + shape=[None, self.get_hyper('max_num_tokens')], + name='parent_tokens_mask') + else: + self.placeholders['parent_tokens'] = None + self.placeholders['parent_tokens_mask'] = None + config = BertConfig(vocab_size=self.get_hyper('token_vocab_size'), hidden_size=self.get_hyper('self_attention_hidden_size'), num_hidden_layers=self.get_hyper('self_attention_num_layers'), @@ -40,9 +53,11 @@ def make_model(self, is_train: bool = False) -> tf.Tensor: model = BertModel(config=config, is_training=is_train, - 
input_ids=self.placeholders['tokens'], + input_ids= self.placeholders['tokens'], input_mask=self.placeholders['tokens_mask'], - use_one_hot_embeddings=False) + use_one_hot_embeddings=False, + parent_ids=self.placeholders['parent_tokens'], + parent_mask=self.placeholders['parent_tokens_mask']) output_pool_mode = self.get_hyper('self_attention_pool_mode').lower() if output_pool_mode == 'bert': diff --git a/src/encoders/seq_encoder.py b/src/encoders/seq_encoder.py index 79f96e2c..576721da 100755 --- a/src/encoders/seq_encoder.py +++ b/src/encoders/seq_encoder.py @@ -128,7 +128,8 @@ def load_data_from_sample(cls, data_to_load: Any, function_name: Optional[str], result_holder: Dict[str, Any], - is_test: bool = True) -> bool: + is_test: bool = True, + parent_tokens=False) -> bool: """ Saves two versions of both the code and the query: one using the docstring as the query and the other using the function-name as the query, and replacing the function name in the code with an out-of-vocab token. @@ -168,6 +169,17 @@ def load_data_from_sample(cls, result_holder[f'{encoder_label}_tokens_mask_{key}'] = tokens_mask result_holder[f'{encoder_label}_tokens_length_{key}'] = int(np.sum(tokens_mask)) + if parent_tokens: + parent_tokens = [Vocabulary.get_unk() if token==None else token for token in parent_tokens] + tokens, tokens_mask = \ + convert_and_pad_token_sequence(metadata['token_vocab'], list(parent_tokens), + hyperparameters[f'{encoder_label}_max_num_tokens']) + # Note that we share the result_holder with different encoders, and so we need to make our identifiers + # unique-ish + result_holder[f'{encoder_label}_parent_tokens_{key}'] = tokens + result_holder[f'{encoder_label}_parent_tokens_mask_{key}'] = tokens_mask + result_holder[f'{encoder_label}_parent_tokens_length_{key}'] = int(np.sum(tokens_mask)) + if result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'] is None or \ int(np.sum(result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'])) == 0: return False @@ -187,6 +199,11 @@ def extend_minibatch_by_sample(self, batch_data: Dict[str, Any], sample: Dict[st current_sample['tokens_mask'] = sample[f'{self.label}_tokens_mask_{query_type}'] current_sample['tokens_lengths'] = sample[f'{self.label}_tokens_length_{query_type}'] + if self.label == 'code': + current_sample['parent_tokens'] = sample[f'{self.label}_parent_tokens_{query_type}'] + current_sample['parent_tokens_mask'] = sample[f'{self.label}_parent_tokens_mask_{query_type}'] + current_sample['parent_tokens_lengths'] = sample[f'{self.label}_parent_tokens_length_{query_type}'] + # In the query, randomly add high-frequency tokens: # TODO: Add tokens with frequency proportional to their frequency in the vocabulary if is_train and self.label == 'query' and self.hyperparameters['query_random_token_frequency'] > 0.: diff --git a/src/encoders/utils/bert_self_attention.py b/src/encoders/utils/bert_self_attention.py index 76918e95..ecbbb066 100755 --- a/src/encoders/utils/bert_self_attention.py +++ b/src/encoders/utils/bert_self_attention.py @@ -18,6 +18,9 @@ from __future__ import division from __future__ import print_function +import tensorflow.contrib.eager as tfe + + import collections import copy import json @@ -137,7 +140,9 @@ def __init__(self, token_type_ids=None, use_one_hot_embeddings=True, scope=None, - embedded_input=None): + embedded_input=None, + parent_ids=None, + parent_mask=None): """Constructor for BertModel. 
Args: @@ -160,6 +165,7 @@ def __init__(self, ValueError: The config is invalid or one of the input tensor shapes is invalid. """ + config = copy.deepcopy(config) if not is_training: config.hidden_dropout_prob = 0.0 @@ -179,6 +185,7 @@ def __init__(self, with tf.variable_scope("embeddings"): if embedded_input is None: # Perform embedding lookup on the word ids. + #returns a vector of B x SeqLength x hidden_size (self.embedding_output, self.embedding_table) = embedding_lookup( input_ids=input_ids, vocab_size=config.vocab_size, @@ -186,6 +193,7 @@ def __init__(self, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) + else: self.embedding_output = embedded_input @@ -212,18 +220,69 @@ def __init__(self, # Run the stacked transformer. # `sequence_output` shape = [batch_size, seq_length, hidden_size]. - self.all_encoder_layers = transformer_model( - input_tensor=self.embedding_output, - attention_mask=attention_mask, - hidden_size=config.hidden_size, - num_hidden_layers=config.num_hidden_layers, - num_attention_heads=config.num_attention_heads, - intermediate_size=config.intermediate_size, - intermediate_act_fn=get_activation(config.hidden_act), - hidden_dropout_prob=config.hidden_dropout_prob, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, - initializer_range=config.initializer_range, - do_return_all_layers=True) + + if parent_ids is not None: + + if parent_mask is None: + parent_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + parent_attention_mask = create_attention_mask_from_input_mask( + parent_ids, parent_mask) + + identity = tf.eye(seq_length) + identity = tf.reshape(identity, [1, seq_length, seq_length]) + + parent_attention_mask = tf.tile(identity, [batch_size,1,1]) + + parent_embedding_output, parent_embedding_table = embedding_lookup( + input_ids=parent_ids, + vocab_size=config.vocab_size, + embedding_size=config.hidden_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) + + parent_embedding_output = embedding_postprocessor( + input_tensor=parent_embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + + + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True, + parent_tensor=parent_embedding_output, + parent_attention_mask=parent_attention_mask) + else: + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + 
intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) self.sequence_output = self.all_encoder_layers[-1] # The "pooler" converts the encoded sequence tensor of shape @@ -519,7 +578,6 @@ def create_attention_mask_from_input_mask(from_tensor, to_mask): return mask - def attention_layer(from_tensor, to_tensor, attention_mask=None, @@ -664,10 +722,20 @@ def transpose_for_scores(input_tensor, batch_size, num_attention_heads, # attention scores. # `attention_scores` = [B, N, F, T] attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + # if num_attention_heads == 1: + # attention_scores = tf.Print(attention_scores, [attention_scores[0][0][i] for i in range(10)], + # "ATTENTION MASK: unscaled attention scores\n", summarize=10) + # query_layer = tf.Print(query_layer, [query_layer[0][0][i] for i in range(10)], + # "ATTENTION MASK: query\n", summarize=10) + # key_layer = tf.Print(key_layer, [key_layer[0][0][i] for i in range(10)], + # "ATTENTION MASK: key\n", summarize=10) attention_scores = tf.multiply(attention_scores, 1.0 / math.sqrt(float(size_per_head))) if attention_mask is not None: + # if num_attention_heads==1: + # attention_mask = tf.Print(attention_mask, [attention_mask[0][i] for i in range(10)], "ATTENTION MASK: original_attention_masks\n", summarize=10) + # `attention_mask` = [B, 1, F, T] attention_mask = tf.expand_dims(attention_mask, axis=[1]) @@ -676,17 +744,28 @@ def transpose_for_scores(input_tensor, batch_size, num_attention_heads, # positions we want to attend and -10000.0 for masked positions. adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 + # if num_attention_heads==1: + # adder = tf.Print(adder, [adder[0][0][i] for i in range(10)], "ATTENTION MASK: adders\n", summarize=10) + # attention_scores = tf.Print(attention_scores, [attention_scores[0][0][i] for i in range(10)], "ATTENTION MASK: raw attention_scores\n", summarize=10) + # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_scores += adder + # if num_attention_heads==1: + # attention_scores = tf.Print(attention_scores, [attention_scores[0][0][i] for i in range(10)], "ATTENTION MASK: added attention_scores\n", summarize=10) # Normalize the attention scores to probabilities. # `attention_probs` = [B, N, F, T] attention_probs = tf.nn.softmax(attention_scores) + # if num_attention_heads == 1: + # attention_probs = tf.Print(attention_probs, [attention_probs[0][0][i] for i in range(10)], "ATTENTION MASK:parent attention probs\n", summarize=10) + # else: + # attention_probs = tf.Print(attention_probs, [attention_probs[0][0][i] for i in range(10)], + # "ATTENTION MASK: normal attention probs\n", summarize=10) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + # attention_probs = dropout(attention_probs, attention_probs_dropout_prob) # `value_layer` = [B, T, N, H] value_layer = tf.reshape( @@ -726,7 +805,9 @@ def transformer_model(input_tensor, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, initializer_range=0.02, - do_return_all_layers=False): + do_return_all_layers=False, + parent_tensor=None, + parent_attention_mask=None): """Multi-headed, multi-layer Transformer from "Attention is All You Need". This is almost an exact implementation of the original Transformer encoder. @@ -786,6 +867,7 @@ def transformer_model(input_tensor, # the GPU/CPU but may not be free on the TPU, so we want to minimize them to # help the optimizer. prev_output = reshape_to_matrix(input_tensor) + old_attention_head = num_attention_heads all_layer_outputs = [] for layer_idx in range(num_hidden_layers): @@ -794,6 +876,37 @@ def transformer_model(input_tensor, with tf.variable_scope("attention"): attention_heads = [] + num_attention_heads = old_attention_head + if parent_tensor is not None: + with tf.variable_scope('parent'): + attention_head_size = int(hidden_size / num_attention_heads) + parent_shape = get_shape_list(parent_tensor, expected_rank=3) + parent_input_width = parent_shape[2] + + if parent_input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (parent_input_width, hidden_size)) + + parent_reshaped_tensor = reshape_to_matrix(parent_tensor) + + attention_head = attention_layer( + from_tensor=layer_input, + to_tensor=parent_reshaped_tensor, + attention_mask=parent_attention_mask, + num_attention_heads=1, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + + attention_heads.append(attention_head) + old_attention_head = num_attention_heads + num_attention_heads = num_attention_heads-1 + parent_tensor = None + with tf.variable_scope("self"): attention_head = attention_layer( from_tensor=layer_input, @@ -948,4 +1061,74 @@ def assert_rank(tensor, expected_rank, name=None): raise ValueError( "For the tensor `%s` in scope `%s`, the actual rank " "`%d` (shape = %s) is not equal to the expected rank `%s`" % - (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) \ No newline at end of file + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) + + + + +if __name__=='__main__': + print('something') + + x = tf.placeholder(tf.float32, shape=[None, 4]) + y = tf.placeholder(tf.float32, shape=[None, 4]) + + # test = tf.layers.dense( + # x, + # 2 * 3, + # activation=None, + # kernel_initializer=create_initializer(0.02)) + + + from_tensor = x + to_tensor = y + context = attention_layer(from_tensor, to_tensor, num_attention_heads=1, size_per_head=4, batch_size=1, from_seq_length=2, to_seq_length=2) + # transformer = transformer_model(from_tensor, + # attention_mask=None, + # hidden_size=4, + # num_hidden_layers=1, + # num_attention_heads=1, + # intermediate_size=4, + # intermediate_act_fn=get_activation('gelu'), + # hidden_dropout_prob=0.1, + # attention_probs_dropout_prob=0.1, + # initializer_range=0.02, + # do_return_all_layers=False) + # + # input_ids = tf.constant([[31, 51, 99, 100], [15, 5, 0, 200]]) + # input_mask = tf.constant([[1, 1, 1, 1], [1, 1, 0, 1]]) + # token_type_ids = tf.constant([[0, 0, 1, 1], 
[0, 2, 0, 1]]) + + # config = BertConfig(vocab_size=4, hidden_size=1, + # num_hidden_layers=1, num_attention_heads=1, intermediate_size=4) + # + # model = BertModel(config=config, is_training=True, + # input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + # + # return_value = model.get_pooled_output() + + '''python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings)''' + + + sess = tf.InteractiveSession() + + tf.global_variables_initializer().run() + + # print(print('Loss(x,y) = {}'.format(sess.run([model], {x: [["t","2","a","a"], ["a","b","c","d"]]})))) + + print(print('Loss(x,y) = {}'.format(sess.run([context], {x:[[1,2,3,4], [11,12,13,19]], y:[[1,2,3,4], [11,12,13,19]]})))) + + print(print('Loss(x,y) = {}'.format(sess.run([context], {x:[[1,2,3,4], [11,12,13,19]], y:[[6,7,8,9], [20,21,22,23]]})))) \ No newline at end of file diff --git a/src/gpurequirements.tx b/src/gpurequirements.tx index 8b19cbd1..b056805e 100644 --- a/src/gpurequirements.tx +++ b/src/gpurequirements.tx @@ -1,3 +1,4 @@ +2to3==1.0 absl-py==0.9.0 altair==3.2.0 annoy==1.16.0 diff --git a/src/models/model.py b/src/models/model.py index 2420a19d..626fdef4 100755 --- a/src/models/model.py +++ b/src/models/model.py @@ -7,6 +7,7 @@ from collections import defaultdict, OrderedDict from enum import Enum, auto from typing import List, Dict, Any, Iterable, Tuple, Optional, Union, Callable, Type, DefaultDict +from tensorflow.python import debug as tf_debug import numpy as np import wandb @@ -19,7 +20,7 @@ LoadedSamples = Dict[str, List[Dict[str, Any]]] SampleId = Tuple[str, int] - +from pprint import pprint class RepresentationType(Enum): CODE = auto() @@ -62,7 +63,8 @@ def parse_data_file(hyperparameters: Dict[str, Any], raw_sample['code_tokens'], function_name, sample, - is_test) + is_test, + raw_sample['parent_dfs']) use_query_flag = query_encoder_class.load_data_from_sample("query", hyperparameters, @@ -71,6 +73,7 @@ def parse_data_file(hyperparameters: Dict[str, Any], function_name, sample, is_test) + use_example = use_code_flag and use_query_flag results[language].append((use_example, sample)) return results @@ -152,7 +155,7 @@ def __init__(self, graph = tf.Graph() self.__sess = tf.Session(graph=graph, config=config) - + # save directory as tensorboard. 
self.__tensorboard_dir = log_save_dir @@ -505,7 +508,7 @@ def __init_minibatch(self) -> Dict[str, Any]: for (language, language_encoder) in self.__code_encoders.items(): batch_data['per_language_query_data'][language] = {} batch_data['per_language_query_data'][language]['query_sample_ids'] = [] - self.__query_encoder.init_minibatch(batch_data['per_language_query_data'][language]) + self.__query_encoder.init_minibatch(batch_data['per_language_query_data'][language], code=False) batch_data['per_language_code_data'][language] = {} batch_data['per_language_code_data'][language]['code_sample_ids'] = [] language_encoder.init_minibatch(batch_data['per_language_code_data'][language]) @@ -723,6 +726,8 @@ def __run_epoch_in_batches(self, data: LoadedSamples, epoch_name: str, is_train: ops_to_run = {'loss': self.__ops['loss'], 'mrr': self.__ops['mrr']} if is_train: ops_to_run['train_step'] = self.__ops['train_step'] + + # print(batch_data_dict) op_results = self.__sess.run(ops_to_run, feed_dict=batch_data_dict) assert not np.isnan(op_results['loss']) diff --git a/src/models/self_att_model.py b/src/models/self_att_model.py index c47fa200..269b7cd3 100755 --- a/src/models/self_att_model.py +++ b/src/models/self_att_model.py @@ -16,6 +16,7 @@ def get_default_hyperparameters(cls) -> Dict[str, Any]: 'code_use_subtokens': False, 'code_mark_subtoken_end': False, 'batch_size': 450, + 'use_parent': True } hypers.update(super().get_default_hyperparameters()) hypers.update(model_hypers) diff --git a/src/train.py b/src/train.py index f0eeaba6..1bbbe28b 100755 --- a/src/train.py +++ b/src/train.py @@ -129,16 +129,6 @@ def run(arguments, tag_in_vcs=False) -> None: model_class = model_restore_helper.get_model_class_from_name(arguments['--model']) hyperparameters = model_class.get_default_hyperparameters() - - #CUSTOM HYPERPARAMS - batch_size = arguments.get('--batch-size') - if batch_size: - hyperparameters['batch_size'] = int(batch_size) - # hyperparameters['code_use_bpe'] = False - # hyperparameters['query_use_bpe'] = False - - print("testing, stuff", hyperparameters['batch_size']) - run_name = make_run_id(arguments) # make name of wandb run = run_id (Doesn't populate yet) @@ -197,17 +187,5 @@ def run(arguments, tag_in_vcs=False) -> None: if __name__ == '__main__': - print('random test') args = docopt(__doc__) - args['--model'] = 'selfatt' - args['--dryrun'] = True - # args['--testrun'] = True - args['--sequential'] = True - args['--max_epoch'] = 20 - args['--batch-size'] = 2 - - # run_and_debug(lambda: run(args), args['--debug']) - - # args = docopt(__doc__) - # args=['--model', 'selfatten', '--testrun'] - run(args) + run_and_debug(lambda: run(args), args['--debug'])
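
Note on the new code_tokens / parent_dfs fields (a minimal sketch, not the code added in parent_node_parse_helpers.py): dfs_traversal_with_parents walks the AST depth-first and emits two parallel sequences of equal length, one holding the node type and value tokens and the other holding, at each position, the type of the node that owns that token. The toy traversal below simplifies node coverage and naming, but it shows the data layout that ends up in the jsonl samples.

# Hedged sketch of the parallel token/parent layout only; NOT the
# implementation from the diff (node coverage and value handling are
# simplified to Name/FunctionDef-style nodes).
import ast

def toy_dfs_with_parents(code):
    tree = ast.parse(code)
    dfs_list, parent_dfs = [], []

    def visit(node, parent_type):
        # One entry per node: its type, paired with its parent's type.
        dfs_list.append(type(node).__name__)
        parent_dfs.append(parent_type)
        # Nodes with a value (e.g. identifiers, function names) contribute an
        # extra token whose recorded parent is the owning node's type,
        # mirroring the pairing used in dfs_traversal_with_parents.
        value = getattr(node, "id", None) or getattr(node, "name", None)
        if value is not None:
            dfs_list.append(value)
            parent_dfs.append(type(node).__name__)
        for child in ast.iter_child_nodes(node):
            visit(child, type(node).__name__)

    visit(tree, None)
    return dfs_list, parent_dfs

if __name__ == "__main__":
    dfs, parents = toy_dfs_with_parents("def f(a):\n    return a + 1\n")
    for tok, par in zip(dfs, parents):
        print(f"{tok:<12} parent={par}")

Because the two lists are built with paired appends, dfs_list[i] and parent_dfs[i] always line up, which is what lets the encoder consume them as two token sequences of the same length.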
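
Note on the parent attention wiring in bert_self_attention.py (a NumPy sketch of the masking trick only, not the TF graph code): when parent_ids are supplied, the first transformer layer reserves one attention head for an attention from the token states to the parent embeddings whose mask is tiled from an identity matrix, so position i can only attend to parent position i; the remaining num_attention_heads - 1 heads run ordinary self-attention, and parent_tensor is set to None inside the loop so later layers are unchanged. The sketch below shows why the identity mask makes each position pick up exactly its own parent's representation (the learned query/key/value projections of attention_layer are omitted here).

# Hedged NumPy illustration of an identity attention mask; not the TF code.
import numpy as np

def masked_attention(queries, keys, values, mask):
    d = queries.shape[-1]
    scores = queries @ keys.T / np.sqrt(d)      # [F, T] raw attention scores
    scores += (1.0 - mask) * -10000.0           # same additive masking trick as BERT
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)  # softmax over the "to" positions
    return probs @ values                       # [F, H]

seq_len, hidden = 4, 8
rng = np.random.default_rng(0)
code_states = rng.normal(size=(seq_len, hidden))   # "from" tensor: token states
parent_embed = rng.normal(size=(seq_len, hidden))  # "to" tensor: parent embeddings

identity_mask = np.eye(seq_len)                    # cf. tf.eye(seq_length) in the diff
out = masked_attention(code_states, parent_embed, parent_embed, identity_mask)

# With an identity mask the attention distribution collapses to one-hot,
# so row i of `out` equals row i of `parent_embed` (up to the value
# projection, which this toy omits).
assert np.allclose(out, parent_embed)

In effect the parent head injects a (projected) copy of each token's parent embedding into that position, while the layer's overall hidden size and total head count stay the same because one head is borrowed from self-attention rather than added.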
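
Note on how the parent sequence reaches the encoder (the helper below is a hypothetical stand-in, not the repo's Vocabulary / convert_and_pad_token_sequence API): seq_encoder.load_data_from_sample maps None parents (the root) to the UNK token, then converts and pads the sequence using the same token vocabulary and max_num_tokens limit as code_tokens, storing the result under the *_parent_tokens_* keys that masked_seq_encoder later feeds into the parent_tokens placeholders. A simplified sketch of that shaping step:

# Hedged stand-in for UNK substitution plus pad/truncate with a 0/1 mask.
# `unk_token` / `pad_token` are placeholder names, not the repo's vocabulary API.
from typing import List, Optional, Tuple

def pad_parent_tokens(parent_dfs: List[Optional[str]], max_num_tokens: int,
                      unk_token: str = "%UNK%", pad_token: str = "%PAD%"
                      ) -> Tuple[List[str], List[int]]:
    tokens = [unk_token if tok is None else tok for tok in parent_dfs]
    tokens = tokens[:max_num_tokens]           # truncate to the encoder's window
    mask = [1] * len(tokens)
    padding = max_num_tokens - len(tokens)
    return tokens + [pad_token] * padding, mask + [0] * padding

tokens, mask = pad_parent_tokens([None, "Module", "FunctionDef"], max_num_tokens=5)
print(tokens)  # ['%UNK%', 'Module', 'FunctionDef', '%PAD%', '%PAD%']
print(mask)    # [1, 1, 1, 0, 0]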