diff --git a/README.md b/README.md index b70fa81..b33ae25 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,10 @@ +## Python 3 Version of Show, Attend and Tell using Tensorflow +This repo is a Python 3 version of [DeepRNN/image_captioning](https://github.com/DeepRNN/image_captioning), which implements "Show, Attend and Tell: Neural Image Caption Generation with Visual Attention" by Xu et al. (ICML2015). Many thanks to [salaniz's COCO evaluation tools for Python 3](https://github.com/salaniz/pycocoevalcap). I am using: +- Python 3.6 +- Tensorflow 1.8.0 + +#### Original README below + ### Introduction This neural system for image captioning is roughly based on the paper "Show, Attend and Tell: Neural Image Caption Generation with Visual Attention" by Xu et al. (ICML2015). The input is an image, and the output is a sentence describing the content of the image. It uses a convolutional neural network to extract visual features from the image, and uses a LSTM recurrent neural network to decode these features into a sentence. A soft attention mechanism is incorporated to improve the quality of the caption. This project is implemented using the Tensorflow library, and allows end-to-end training of both CNN and RNN parts. diff --git a/base_model.py b/base_model.py index 9e0c58e..f291b60 100644 --- a/base_model.py +++ b/base_model.py @@ -2,8 +2,12 @@ import numpy as np import pandas as pd import tensorflow as tf + +import matplotlib +matplotlib.use('agg') + import matplotlib.pyplot as plt -import cPickle as pickle +import pickle import copy import json from tqdm import tqdm @@ -66,6 +70,7 @@ def eval(self, sess, eval_gt_coco, eval_data, vocabulary): config = self.config results = [] + print('config.eval_result_dir:', config.eval_result_dir) if not os.path.exists(config.eval_result_dir): os.mkdir(config.eval_result_dir) @@ -81,7 +86,7 @@ def eval(self, sess, eval_gt_coco, eval_data, vocabulary): word_idxs = caption_data[l][0].sentence score = caption_data[l][0].score caption = vocabulary.get_sentence(word_idxs) - results.append({'image_id': eval_data.image_ids[idx], + results.append({'image_id': eval_data.image_ids[idx].item(), 'caption': caption}) idx += 1 @@ -97,7 +102,7 @@ def eval(self, sess, eval_gt_coco, eval_data, vocabulary): plt.savefig(os.path.join(config.eval_result_dir, image_name+'_result.jpg')) - fp = open(config.eval_result_file, 'wb') + fp = open(config.eval_result_file, 'w') json.dump(results, fp) fp.close() @@ -259,7 +264,7 @@ def load(self, sess, model_file=None): str(global_step)+".npy") print("Loading the model from %s..." %save_path) - data_dict = np.load(save_path).item() + data_dict = np.load(save_path, encoding='latin1').item() count = 0 for v in tqdm(tf.global_variables()): if v.name in data_dict.keys(): @@ -270,11 +275,14 @@ def load(self, sess, model_file=None): def load_cnn(self, session, data_path, ignore_missing=True): """ Load a pretrained CNN model. """ print("Loading the CNN from %s..."
%data_path) - data_dict = np.load(data_path).item() + # import pdb; pdb.set_trace() + import os; + data_path = data_path.strip() + data_dict = np.load(os.getcwd() + '/' + data_path, encoding='latin1').item() count = 0 for op_name in tqdm(data_dict): with tf.variable_scope(op_name, reuse = True): - for param_name, data in data_dict[op_name].iteritems(): + for param_name, data in data_dict[op_name].items(): try: var = tf.get_variable(param_name) session.run(var.assign(data)) diff --git a/dataset.py b/dataset.py index 5f656a0..300e10a 100644 --- a/dataset.py +++ b/dataset.py @@ -122,7 +122,7 @@ def prepare_train_data(config): data = {'word_idxs': word_idxs, 'masks': masks} np.save(config.temp_data_file, data) else: - data = np.load(config.temp_data_file).item() + data = np.load(config.temp_data_file, encoding='latin1').item() word_idxs = data['word_idxs'] masks = data['masks'] print("Captions processed.") diff --git a/utils/coco/pycocoevalcap/.gitignore b/utils/coco/pycocoevalcap/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/utils/coco/pycocoevalcap/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/utils/coco/pycocoevalcap/README.md b/utils/coco/pycocoevalcap/README.md new file mode 100644 index 0000000..7081c81 --- /dev/null +++ b/utils/coco/pycocoevalcap/README.md @@ -0,0 +1,44 @@ +Microsoft COCO Caption Evaluation +=================== + +Evaluation codes for MS COCO caption generation. + +## Description ## +This repository provides Python 3 support for the caption evaluation metrics used for the MS COCO dataset. + +The code is derived from the original repository that supports Python 2.7: https://github.com/tylin/coco-caption. +Caption evaluation depends on the COCO API that natively supports Python 3 (see Requirements). + +## Requirements ## +- Java 1.8.0 +- Python 3 (tested on Python 3.6) +- pycocotools (COCO Python API): https://github.com/cocodataset/cocoapi + +## Files ## +./ +- eval.py: The file includes the COCOEvalCap class that can be used to evaluate results on COCO. +- tokenizer: Python wrapper of Stanford CoreNLP PTBTokenizer +- bleu: Bleu evaluation codes +- meteor: Meteor evaluation codes +- rouge: Rouge-L evaluation codes +- cider: CIDEr evaluation codes + +## References ## + +- [Microsoft COCO Captions: Data Collection and Evaluation Server](http://arxiv.org/abs/1504.00325) +- PTBTokenizer: We use the [Stanford Tokenizer](http://nlp.stanford.edu/software/tokenizer.shtml) which is included in [Stanford CoreNLP 3.4.1](http://nlp.stanford.edu/software/corenlp.shtml). +- BLEU: [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) +- Meteor: [Project page](http://www.cs.cmu.edu/~alavie/METEOR/) with related publications. We use the latest version (1.5) of the [Code](https://github.com/mjdenkowski/meteor). Changes have been made to the source code to properly aggregate the statistics for the entire corpus.
+- Rouge-L: [ROUGE: A Package for Automatic Evaluation of Summaries](http://anthology.aclweb.org/W/W04/W04-1013.pdf) +- CIDEr: [CIDEr: Consensus-based Image Description Evaluation](http://arxiv.org/pdf/1411.5726.pdf) + +## Developers ## +- Xinlei Chen (CMU) +- Hao Fang (University of Washington) +- Tsung-Yi Lin (Cornell) +- Ramakrishna Vedantam (Virginia Tech) + +## Acknowledgement ## +- David Chiang (University of Notre Dame) +- Michael Denkowski (CMU) +- Alexander Rush (Harvard University) diff --git a/utils/coco/pycocoevalcap/bleu/bleu.py b/utils/coco/pycocoevalcap/bleu/bleu.py index b0da5dd..bdebbb5 100755 --- a/utils/coco/pycocoevalcap/bleu/bleu.py +++ b/utils/coco/pycocoevalcap/bleu/bleu.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# +# # File Name : bleu.py # # Description : Wrapper for BLEU scorer. @@ -8,7 +8,7 @@ # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT # Authors : Hao Fang and Tsung-Yi Lin -from bleu_scorer import BleuScorer +from .bleu_scorer import BleuScorer class Bleu: diff --git a/utils/coco/pycocoevalcap/bleu/bleu_scorer.py b/utils/coco/pycocoevalcap/bleu/bleu_scorer.py index 3685e05..d240336 100755 --- a/utils/coco/pycocoevalcap/bleu/bleu_scorer.py +++ b/utils/coco/pycocoevalcap/bleu/bleu_scorer.py @@ -7,7 +7,7 @@ # reserved. Do not redistribute without permission from the # author. Not for commercial use. -# Modified by: +# Modified by: # Hao Fang # Tsung-Yi Lin @@ -26,8 +26,8 @@ def precook(s, n=4, out=False): can take string arguments as well.""" words = s.split() counts = defaultdict(int) - for k in xrange(1,n+1): - for i in xrange(len(words)-k+1): + for k in range(1,n+1): + for i in range(len(words)-k+1): ngram = tuple(words[i:i+k]) counts[ngram] += 1 return (len(words), counts) @@ -42,7 +42,7 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" for ref in refs: rl, counts = precook(ref, n) reflen.append(rl) - for (ngram,count) in counts.iteritems(): + for (ngram,count) in counts.items(): maxcounts[ngram] = max(maxcounts.get(ngram,0), count) # Calculate effective reference sentence length. @@ -52,21 +52,22 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" reflen = float(sum(reflen))/len(reflen) ## lhuang: N.B.: leave reflen computaiton to the very end!! - + ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) return (reflen, maxcounts) -def cook_test(test, (reflen, refmaxcounts), eff=None, n=4): +def cook_test(test, refs, eff=None, n=4): '''Takes a test sentence and returns an object that encapsulates everything that BLEU needs to know about it.''' + reflen, refmaxcounts = refs testlen, counts = precook(test, n, True) result = {} # Calculate effective reference sentence length.
- + if eff == "closest": result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1] else: ## i.e., "average" or "shortest" or None @@ -74,10 +75,10 @@ def cook_test(test, (reflen, refmaxcounts), eff=None, n=4): result["testlen"] = testlen - result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)] + result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)] result['correct'] = [0]*n - for (ngram, count) in counts.iteritems(): + for (ngram, count) in counts.items(): result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) return result @@ -108,7 +109,7 @@ def __init__(self, test=None, refs=None, n=4, special_reflen=None): def cook_append(self, test, refs): '''called by constructor and __iadd__ to avoid creating new instances.''' - + if refs is not None: self.crefs.append(cook_refs(refs)) if test is not None: @@ -136,7 +137,7 @@ def reflen(self, option=None): def testlen(self, option=None): self.compute_score(option=option) - return self._testlen + return self._testlen def retest(self, new_test): if type(new_test) is str: @@ -151,7 +152,7 @@ def retest(self, new_test): def rescore(self, new_test): ''' replace test(s) with new test(s), and returns the new score.''' - + return self.retest(new_test).compute_score() def size(self): @@ -170,7 +171,7 @@ def __iadd__(self, other): self.crefs.extend(other.crefs) self._score = None ## need to recompute - return self + return self def compatible(self, other): return isinstance(other, BleuScorer) and self.n == other.n @@ -179,7 +180,7 @@ def single_reflen(self, option="average"): return self._single_reflen(self.crefs[0][0], option) def _single_reflen(self, reflens, option=None, testlen=None): - + if option == "shortest": reflen = min(reflens) elif option == "average": @@ -194,7 +195,7 @@ def _single_reflen(self, reflens, option=None, testlen=None): def recompute_score(self, option=None, verbose=0): self._score = None return self.compute_score(option, verbose) - + def compute_score(self, option=None, verbose=0): n = self.n small = 1e-9 @@ -212,7 +213,7 @@ def compute_score(self, option=None, verbose=0): totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} # for each sentence - for comps in self.ctest: + for comps in self.ctest: testlen = comps['testlen'] self._testlen += testlen @@ -222,42 +223,42 @@ def compute_score(self, option=None, verbose=0): reflen = self.special_reflen self._reflen += reflen - + for key in ['guess','correct']: - for k in xrange(n): + for k in range(n): totalcomps[key][k] += comps[key][k] # append per image bleu score bleu = 1. - for k in xrange(n): + for k in range(n): bleu *= (float(comps['correct'][k]) + tiny) \ - /(float(comps['guess'][k]) + small) + /(float(comps['guess'][k]) + small) bleu_list[k].append(bleu ** (1./(k+1))) ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division if ratio < 1: - for k in xrange(n): + for k in range(n): bleu_list[k][-1] *= math.exp(1 - 1/ratio) if verbose > 1: - print comps, reflen + print(comps, reflen) totalcomps['reflen'] = self._reflen totalcomps['testlen'] = self._testlen bleus = [] bleu = 1. 
- for k in xrange(n): + for k in range(n): bleu *= float(totalcomps['correct'][k] + tiny) \ / (totalcomps['guess'][k] + small) bleus.append(bleu ** (1./(k+1))) ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division if ratio < 1: - for k in xrange(n): + for k in range(n): bleus[k] *= math.exp(1 - 1/ratio) if verbose > 0: - print totalcomps - print "ratio:", ratio + print(totalcomps) + print("ratio:", ratio) self._score = bleus return self._score, bleu_list diff --git a/utils/coco/pycocoevalcap/cider/cider.py b/utils/coco/pycocoevalcap/cider/cider.py index d0b99ee..18f6cc4 100755 --- a/utils/coco/pycocoevalcap/cider/cider.py +++ b/utils/coco/pycocoevalcap/cider/cider.py @@ -1,18 +1,18 @@ # Filename: cider.py # -# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric +# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) # # Creation Date: Sun Feb 8 14:16:54 2015 # # Authors: Ramakrishna Vedantam and Tsung-Yi Lin -from cider_scorer import CiderScorer +from .cider_scorer import CiderScorer import pdb class Cider: """ - Main Class to compute the CIDEr metric + Main Class to compute the CIDEr metric """ def __init__(self, test=None, refs=None, n=4, sigma=6.0): @@ -26,7 +26,7 @@ def compute_score(self, gts, res): Main function to compute CIDEr score :param hypo_for_image (dict) : dictionary with key and value ref_for_image (dict) : dictionary with key and value - :return: cider (float) : computed CIDEr score for the corpus + :return: cider (float) : computed CIDEr score for the corpus """ assert(gts.keys() == res.keys()) @@ -51,4 +51,4 @@ def compute_score(self, gts, res): return score, scores def method(self): - return "CIDEr" \ No newline at end of file + return "CIDEr" diff --git a/utils/coco/pycocoevalcap/cider/cider_scorer.py b/utils/coco/pycocoevalcap/cider/cider_scorer.py index a73405e..94d6689 100755 --- a/utils/coco/pycocoevalcap/cider/cider_scorer.py +++ b/utils/coco/pycocoevalcap/cider/cider_scorer.py @@ -19,8 +19,8 @@ def precook(s, n=4, out=False): """ words = s.split() counts = defaultdict(int) - for k in xrange(1,n+1): - for i in xrange(len(words)-k+1): + for k in range(1,n+1): + for i in range(len(words)-k+1): ngram = tuple(words[i:i+k]) counts[ngram] += 1 return counts @@ -99,7 +99,7 @@ def compute_doc_freq(self): ''' for refs in self.crefs: # refs, k ref captions of one image - for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]): + for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): self.document_frequency[ngram] += 1 # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) @@ -115,7 +115,7 @@ def counts2vec(cnts): vec = [defaultdict(float) for _ in range(self.n)] length = 0 norm = [0.0 for _ in range(self.n)] - for (ngram,term_freq) in cnts.iteritems(): + for (ngram,term_freq) in cnts.items(): # give word count 1 if it doesn't appear in reference corpus df = np.log(max(1.0, self.document_frequency[ngram])) # ngram index @@ -146,7 +146,7 @@ def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): val = np.array([0.0 for _ in range(self.n)]) for n in range(self.n): # ngram - for (ngram,count) in vec_hyp[n].iteritems(): + for (ngram,count) in vec_hyp[n].items(): # vrama91 : added clipping val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] @@ -189,4 +189,4 @@ def compute_score(self, 
option=None, verbose=0): score = self.compute_cider() # debug # print score - return np.mean(np.array(score)), np.array(score) \ No newline at end of file + return np.mean(np.array(score)), np.array(score) diff --git a/utils/coco/pycocoevalcap/eval.py b/utils/coco/pycocoevalcap/eval.py index bfbac93..d129222 100755 --- a/utils/coco/pycocoevalcap/eval.py +++ b/utils/coco/pycocoevalcap/eval.py @@ -1,9 +1,9 @@ __author__ = 'tylin' -from tokenizer.ptbtokenizer import PTBTokenizer -from bleu.bleu import Bleu -from meteor.meteor import Meteor -from rouge.rouge import Rouge -from cider.cider import Cider +from .tokenizer.ptbtokenizer import PTBTokenizer +from .bleu.bleu import Bleu +from .meteor.meteor import Meteor +from .rouge.rouge import Rouge +from .cider.cider import Cider class COCOEvalCap: def __init__(self, coco, cocoRes): @@ -26,7 +26,7 @@ def evaluate(self): # ================================================= # Set up scorers # ================================================= - print 'tokenization...' + print('tokenization...') tokenizer = PTBTokenizer() gts = tokenizer.tokenize(gts) res = tokenizer.tokenize(res) @@ -34,7 +34,7 @@ def evaluate(self): # ================================================= # Set up scorers # ================================================= - print 'setting up scorers...' + print('setting up scorers...') scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(),"METEOR"), @@ -46,17 +46,17 @@ def evaluate(self): # Compute scores # ================================================= for scorer, method in scorers: - print 'computing %s score...'%(scorer.method()) + print('computing %s score...'%(scorer.method())) score, scores = scorer.compute_score(gts, res) if type(method) == list: for sc, scs, m in zip(score, scores, method): self.setEval(sc, m) self.setImgToEvalImgs(scs, gts.keys(), m) - print "%s: %0.3f"%(m, sc) + print("%s: %0.3f"%(m, sc)) else: self.setEval(score, method) self.setImgToEvalImgs(scores, gts.keys(), method) - print "%s: %0.3f"%(method, score) + print("%s: %0.3f"%(method, score)) self.setEvalImgs() def setEval(self, score, method): @@ -70,4 +70,4 @@ def setImgToEvalImgs(self, scores, imgIds, method): self.imgToEval[imgId][method] = score def setEvalImgs(self): - self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] \ No newline at end of file + self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] diff --git a/utils/coco/pycocoevalcap/license.txt b/utils/coco/pycocoevalcap/license.txt new file mode 100644 index 0000000..7428d36 --- /dev/null +++ b/utils/coco/pycocoevalcap/license.txt @@ -0,0 +1,26 @@ +Copyright (c) 2015, Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. diff --git a/utils/coco/pycocoevalcap/meteor/meteor.py b/utils/coco/pycocoevalcap/meteor/meteor.py index 9a61606..61916f4 100755 --- a/utils/coco/pycocoevalcap/meteor/meteor.py +++ b/utils/coco/pycocoevalcap/meteor/meteor.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # Python wrapper for METEOR implementation, by Xinlei Chen -# Acknowledge Michael Denkowski for the generous discussion and help +# Acknowledge Michael Denkowski for the generous discussion and help import os import sys @@ -37,7 +37,8 @@ def compute_score(self, gts, res): stat = self._stat(res[i][0], gts[i]) eval_line += ' ||| {}'.format(stat) - self.meteor_p.stdin.write('{}\n'.format(eval_line)) + self.meteor_p.stdin.write('{}\n'.format(eval_line).encode()) + self.meteor_p.stdin.flush() for i in range(0,len(imgIds)): scores.append(float(self.meteor_p.stdout.readline().strip())) score = float(self.meteor_p.stdout.readline().strip()) @@ -52,8 +53,9 @@ def _stat(self, hypothesis_str, reference_list): # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) - self.meteor_p.stdin.write('{}\n'.format(score_line)) - return self.meteor_p.stdout.readline().strip() + self.meteor_p.stdin.write('{}\n'.format(score_line).encode()) + self.meteor_p.stdin.flush() + return self.meteor_p.stdout.readline().decode().strip() def _score(self, hypothesis_str, reference_list): self.lock.acquire() @@ -63,7 +65,7 @@ def _score(self, hypothesis_str, reference_list): self.meteor_p.stdin.write('{}\n'.format(score_line)) stats = self.meteor_p.stdout.readline().strip() eval_line = 'EVAL ||| {}'.format(stats) - # EVAL ||| stats + # EVAL ||| stats self.meteor_p.stdin.write('{}\n'.format(eval_line)) score = float(self.meteor_p.stdout.readline().strip()) # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice @@ -71,8 +73,8 @@ def _score(self, hypothesis_str, reference_list): score = float(self.meteor_p.stdout.readline().strip()) self.lock.release() return score - - def __exit__(self): + + def __del__(self): self.lock.acquire() self.meteor_p.stdin.close() self.meteor_p.kill() diff --git a/utils/coco/pycocoevalcap/rouge/rouge.py b/utils/coco/pycocoevalcap/rouge/rouge.py index 3a10f5a..842f397 100755 --- a/utils/coco/pycocoevalcap/rouge/rouge.py +++ b/utils/coco/pycocoevalcap/rouge/rouge.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# +# # File Name : rouge.py # # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) @@ -49,14 +49,14 @@ def calc_score(self, candidate, refs): :param refs: list of str : COCO reference sentences for the particular image to be evaluated :returns score: int (ROUGE-L score for the candidate 
evaluated against references) """ - assert(len(candidate)==1) - assert(len(refs)>0) + assert(len(candidate)==1) + assert(len(refs)>0) prec = [] rec = [] # split into tokens token_c = candidate[0].split(" ") - + for reference in refs: # split into tokens token_r = reference.split(" ") @@ -77,8 +77,8 @@ def calc_score(self, candidate, refs): def compute_score(self, gts, res): """ Computes Rouge-L score given a set of reference and candidate sentences for the dataset - Invoked by evaluate_captions.py - :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values + Invoked by evaluate_captions.py + :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) """ diff --git a/utils/coco/pycocoevalcap/tokenizer/ptbtokenizer.py b/utils/coco/pycocoevalcap/tokenizer/ptbtokenizer.py index aa790ba..6b7f370 100755 --- a/utils/coco/pycocoevalcap/tokenizer/ptbtokenizer.py +++ b/utils/coco/pycocoevalcap/tokenizer/ptbtokenizer.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# +# # File Name : ptbtokenizer.py # # Description : Do the PTB Tokenization and remove punctuations. @@ -19,7 +19,7 @@ # punctuations to be removed from the sentences PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ - ".", "?", "!", ",", ":", "-", "--", "...", ";"] + ".", "?", "!", ",", ":", "-", "--", "...", ";"] class PTBTokenizer: """Python wrapper of Stanford PTBTokenizer""" @@ -41,7 +41,7 @@ def tokenize(self, captions_for_image): # ====================================================== path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) - tmp_file.write(sentences) + tmp_file.write(sentences.encode()) tmp_file.close() # ====================================================== @@ -51,6 +51,7 @@ def tokenize(self, captions_for_image): p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ stdout=subprocess.PIPE) token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] + token_lines = token_lines.decode() lines = token_lines.split('\n') # remove temp file os.remove(tmp_file.name)
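Aside from mechanical syntax changes (relative imports, `range`, `dict.items()`, `print()`), the port reduces to two Python 2 → 3 issues that recur across base_model.py, dataset.py, meteor.py, and ptbtokenizer.py: unpickling `.npy` files written under Python 2, and exchanging text with Java subprocesses whose pipes now carry bytes. The sketch below is not part of the diff; it is a minimal, self-contained illustration of those two patterns. The names `load_py2_pickle` and `echo_through_subprocess` are placeholders, and `cat` is used only as a Unix stand-in for the METEOR/PTBTokenizer jars.

```python
# Minimal sketch (assumptions noted inline) of the two Python 2 -> 3 patterns
# this port relies on.
import subprocess

import numpy as np


def load_py2_pickle(path):
    """Load a .npy file holding a dict pickled under Python 2 (e.g. pretrained CNN weights)."""
    # encoding='latin1' lets Python 3 decode byte strings written by Python 2;
    # allow_pickle=True is required for object arrays on newer NumPy releases.
    return np.load(path, encoding='latin1', allow_pickle=True).item()


def echo_through_subprocess(line, cmd=('cat',)):
    """Send one text line to a subprocess and read one line back.

    In Python 3 the pipes carry bytes, so text is encoded on write and decoded
    on read -- the same pattern used around the METEOR and PTBTokenizer jars.
    """
    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    proc.stdin.write('{}\n'.format(line).encode())
    proc.stdin.flush()                     # make sure the child actually sees the line
    reply = proc.stdout.readline().decode().strip()
    proc.stdin.close()
    proc.wait()
    return reply


if __name__ == '__main__':
    print(echo_through_subprocess('hello'))  # prints: hello
```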