diff --git a/pom.xml b/pom.xml
index c234a33..bf18bf2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -36,5 +36,11 @@
slf4j-log4j12
1.6.1
+
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.8.3</version>
+
+
diff --git a/src/main/java/com/datumbox/opensource/classifiers/NaiveBayes.java b/src/main/java/com/datumbox/opensource/classifiers/NaiveBayes.java
new file mode 100644
index 0000000..a4762f0
--- /dev/null
+++ b/src/main/java/com/datumbox/opensource/classifiers/NaiveBayes.java
@@ -0,0 +1,327 @@
+/*
+ * Copyright (C) 2014 Vasilis Vryniotis
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package com.datumbox.opensource.classifiers;
+
+import com.datumbox.opensource.dataobjects.Document;
+import com.datumbox.opensource.dataobjects.FeatureStats;
+import com.datumbox.opensource.dataobjects.NaiveBayesKnowledgeBase;
+import com.datumbox.opensource.features.FeatureExtraction;
+import com.datumbox.opensource.features.TextTokenizer;
+
+import java.util.*;
+
+/**
+ * Implements a basic form of Multinomial Naive Bayes Text Classifier as described at
+ * http://blog.datumbox.com/machine-learning-tutorial-the-naive-bayes-text-classifier/
+ *
+ * @author Vasilis Vryniotis
+ * @see http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/
+ */
+public class NaiveBayes {
+ private double chisquareCriticalValue = 10.83; //equivalent to pvalue 0.001. It is used by feature selection algorithm
+
+ private NaiveBayesKnowledgeBase knowledgeBase;
+
+ /**
+ * Creates a classifier that wraps an already trained model.
+ *
+ * @param knowledgeBase the trained knowledge base that predict() reads from;
+ * passing null yields an untrained classifier (this is what the no-argument
+ * constructor does), which must be trained before predict() is called
+ */
+ public NaiveBayes(NaiveBayesKnowledgeBase knowledgeBase) {
+ this.knowledgeBase = knowledgeBase;
+ }
+
+ /**
+ * Creates an untrained classifier. The knowledge base is left null until
+ * train() is called.
+ */
+ public NaiveBayes() {
+ this(null);
+ }
+
+ /**
+ * Gets the knowledge base currently held by this classifier.
+ *
+ * @return the knowledge base passed to the constructor or built by the last
+ * call to train(); null if neither has happened
+ */
+ public NaiveBayesKnowledgeBase getKnowledgeBase() {
+ return knowledgeBase;
+ }
+
+ /**
+ * Gets the chisquareCriticalValue parameter.
+ *
+ * @return the chi-square threshold that is handed to the feature selection
+ * step during training
+ */
+ public double getChisquareCriticalValue() {
+ return chisquareCriticalValue;
+ }
+
+ /**
+ * Sets the chisquareCriticalValue parameter.
+ *
+ * @param chisquareCriticalValue the chi-square threshold to hand to the
+ * feature selection step on the next call to train()
+ */
+ public void setChisquareCriticalValue(double chisquareCriticalValue) {
+ this.chisquareCriticalValue = chisquareCriticalValue;
+ }
+
+    /**
+     * Preprocesses the original dataset and converts it to a List of Documents.
+     * Every training example is tokenized and labelled with the category it
+     * was listed under.
+     *
+     * @param trainingDataset map from category name to the example texts of
+     *                        that category
+     * @return one tokenized, labelled Document per training example
+     */
+    private List<Document> preprocessDataset(Map<String, String[]> trainingDataset) {
+        List<Document> dataset = new ArrayList<>();
+
+        //loop through all the categories and their training examples
+        for (Map.Entry<String, String[]> entry : trainingDataset.entrySet()) {
+            String category = entry.getKey();
+
+            for (String example : entry.getValue()) {
+                //tokenize the text of the example and tag it with its category
+                Document doc = TextTokenizer.tokenize(example);
+                doc.category = category;
+                dataset.add(doc);
+            }
+        }
+
+        return dataset;
+    }
+
+    /**
+     * Gathers the required counts for the features and performs feature
+     * selection on those counts. The returned FeatureStats only keeps the
+     * features accepted by the chi-square test and is later used for
+     * calculating the probabilities of the model.
+     *
+     * @param dataset the tokenized, labelled training documents
+     * @return the statistics of the dataset restricted to the selected features
+     */
+    private FeatureStats selectFeatures(List<Document> dataset) {
+        FeatureExtraction featureExtractor = new FeatureExtraction();
+
+        //the FeatureStats object contains statistics about all the features found in the documents
+        FeatureStats stats = featureExtractor.extractFeatureStats(dataset);
+
+        //the feature selection algorithm returns the selected features together with their scores
+        Map<String, Double> selectedFeatures = featureExtractor.chisquare(stats, chisquareCriticalValue);
+
+        //clip from the stats all the features that are not selected;
+        //the key set is a live view, so this removes the entries from the map itself
+        stats.featureCategoryJointCount.keySet().retainAll(selectedFeatures.keySet());
+
+        return stats;
+    }
+
+    /**
+     * Trains a Naive Bayes classifier by using the Multinomial Model by passing
+     * the trainingDataset and the prior probabilities.
+     *
+     * The new knowledge base is built on the side and only installed once
+     * training has fully succeeded, so a rejected priors map leaves any
+     * previously trained model untouched.
+     *
+     * @param trainingDataset map from category name to the example texts of
+     *                        that category
+     * @param categoryPriors  prior probability per category, or null to
+     *                        estimate the priors from the category frequencies
+     *                        of the training sample
+     * @throws IllegalArgumentException if priors are given but do not cover
+     *                                  exactly the categories of the dataset,
+     *                                  or contain a null value or a value
+     *                                  outside [0, 1]
+     */
+    public void train(Map<String, String[]> trainingDataset, Map<String, Double> categoryPriors) throws IllegalArgumentException {
+        //preprocess the given dataset
+        List<Document> dataset = preprocessDataset(trainingDataset);
+
+        //produce the feature stats and select the best features
+        FeatureStats featureStats = selectFeatures(dataset);
+
+        //initialize the knowledge base of the classifier
+        NaiveBayesKnowledgeBase trained = new NaiveBayesKnowledgeBase();
+        trained.n = featureStats.n; //number of observations
+        trained.d = featureStats.featureCategoryJointCount.size(); //number of features
+
+        if (categoryPriors == null) {
+            //no priors given: estimate them from the sample
+            trained.c = featureStats.categoryCounts.size(); //number of categories
+
+            for (Map.Entry<String, Integer> entry : featureStats.categoryCounts.entrySet()) {
+                trained.logPriors.put(entry.getKey(), Math.log((double) entry.getValue() / trained.n));
+            }
+        } else {
+            //priors given: they must name exactly the categories seen in the dataset
+            trained.c = categoryPriors.size();
+
+            if (trained.c != featureStats.categoryCounts.size()
+                    || !categoryPriors.keySet().containsAll(featureStats.categoryCounts.keySet())) {
+                throw new IllegalArgumentException("Invalid priors Array: Make sure you pass a prior probability for every supported category.");
+            }
+
+            for (Map.Entry<String, Double> entry : categoryPriors.entrySet()) {
+                Double priorProbability = entry.getValue();
+                if (priorProbability == null) {
+                    throw new IllegalArgumentException("Invalid priors Array: Make sure you pass a prior probability for every supported category.");
+                }
+                //written as a negated range check so that NaN is rejected as well
+                if (!(priorProbability >= 0 && priorProbability <= 1)) {
+                    throw new IllegalArgumentException("Invalid priors Array: Prior probabilities should be between 0 and 1.");
+                }
+
+                trained.logPriors.put(entry.getKey(), Math.log(priorProbability));
+            }
+        }
+
+        //We are performing laplace smoothing (also known as add-1). This requires to estimate the total feature occurrences in each category
+        Map<String, Double> featureOccurrencesInCategory = new HashMap<>();
+        for (String category : trained.logPriors.keySet()) {
+            double featureOccSum = 0.0;
+            for (Map<String, Integer> categoryListOccurrences : featureStats.featureCategoryJointCount.values()) {
+                Integer occurrences = categoryListOccurrences.get(category);
+                if (occurrences != null) {
+                    featureOccSum += occurrences;
+                }
+            }
+            featureOccurrencesInCategory.put(category, featureOccSum);
+        }
+
+        //estimate the smoothed log likelihood of every selected feature in every category
+        for (Map.Entry<String, Map<String, Integer>> entry : featureStats.featureCategoryJointCount.entrySet()) {
+            Map<String, Integer> featureCategoryCounts = entry.getValue();
+            Map<String, Double> likelihoodPerCategory = new HashMap<>();
+
+            for (String category : trained.logPriors.keySet()) {
+                Integer count = featureCategoryCounts.get(category);
+                double smoothedCount = (count == null ? 0 : count) + 1.0;
+
+                likelihoodPerCategory.put(category,
+                        Math.log(smoothedCount / (featureOccurrencesInCategory.get(category) + trained.d)));
+            }
+            trained.logLikelihoods.put(entry.getKey(), likelihoodPerCategory);
+        }
+
+        //install the model only after everything above succeeded
+        knowledgeBase = trained;
+    }
+
+    /**
+     * Wrapper method of train() which enables the estimation of the prior
+     * probabilities based on the sample.
+     *
+     * @param trainingDataset map from category name to the example texts of
+     *                        that category
+     */
+    public void train(Map<String, String[]> trainingDataset) {
+        train(trainingDataset, null);
+    }
+
+    /**
+     * Predicts the category of a text by using an already trained classifier
+     * and returns its category.
+     *
+     * The per-category log scores are normalised into probabilities; the
+     * prediction is withheld when the winner is not convincing enough.
+     *
+     * @param text the text to classify
+     * @return the name of the most probable category, or null when no category
+     *         could be scored, when the winning probability is below 0.4, or
+     *         when it leads the least probable category by less than 0.1
+     * @throws IllegalArgumentException if the classifier has no knowledge base
+     */
+    public String predict(String text) throws IllegalArgumentException {
+        if (knowledgeBase == null) {
+            throw new IllegalArgumentException("Knowledge Bases missing: Make sure you train first a classifier before you use it.");
+        }
+
+        //Tokenizes the text and creates a new document
+        Document doc = TextTokenizer.tokenize(text);
+
+        String maxScoreCategory = null;
+        double maxScore = Double.NEGATIVE_INFINITY;
+        double minScore = Double.POSITIVE_INFINITY;
+        List<Double> scores = new ArrayList<>();
+
+        for (Map.Entry<String, Double> prior : knowledgeBase.logPriors.entrySet()) {
+            String category = prior.getKey();
+            double logprob = prior.getValue(); //initialize the score with the prior
+
+            //foreach feature of the document
+            for (Map.Entry<String, Integer> token : doc.tokens.entrySet()) {
+                Map<String, Double> likelihoods = knowledgeBase.logLikelihoods.get(token.getKey());
+                if (likelihoods == null) {
+                    continue; //if the feature does not exist in the knowledge base skip it
+                }
+
+                //multiply loglikelihood score with the occurrences of the feature in the text
+                logprob += token.getValue() * likelihoods.get(category);
+            }
+
+            scores.add(logprob);
+            if (logprob > maxScore) {
+                maxScore = logprob;
+                maxScoreCategory = category;
+            }
+            if (logprob < minScore) {
+                minScore = logprob;
+            }
+        }
+
+        if (maxScoreCategory == null) {
+            return null; //no category at all, or none with a finite score
+        }
+
+        //Normalise with the log-sum-exp trick: shifting every score by the maximum
+        //keeps exp() away from underflow, which would otherwise turn the
+        //probabilities into NaN and silently disable the thresholds below.
+        double acc = 0d;
+        for (double score : scores) {
+            acc += Math.exp(score - maxScore);
+        }
+        double maxProb = 1.0 / acc;
+        double minProb = Math.exp(minScore - maxScore) / acc;
+
+        if (maxProb < 0.4) {
+            return null;
+        }
+        //the margin is measured between probabilities; with a single category there is no margin to test
+        if (scores.size() > 1 && (maxProb - minProb) < 0.1) {
+            return null;
+        }
+        return maxScoreCategory; //return the category with the highest score
+    }
+}
diff --git a/src/main/java/com/datumbox/opensource/dataobjects/Document.java b/src/main/java/com/datumbox/opensource/dataobjects/Document.java
new file mode 100644
index 0000000..c749250
--- /dev/null
+++ b/src/main/java/com/datumbox/opensource/dataobjects/Document.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2014 Vasilis Vryniotis
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package com.datumbox.opensource.dataobjects;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * The Document Object represents the texts that we use for training or
+ * prediction as a bag of words.
+ *
+ * @author Vasilis Vryniotis
+ * @see <a href="http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/">http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/</a>
+ */
+public class Document {
+
+    /**
+     * Token counts: maps each token of the text to its number of occurrences
+     */
+    public Map<String, Integer> tokens;
+
+    /**
+     * The class of the document; left null by the constructor
+     */
+    public String category;
+
+    /**
+     * Document constructor. Starts with an empty token map.
+     */
+    public Document() {
+        tokens = new HashMap<>();
+    }
+}
diff --git a/src/main/java/com/datumbox/opensource/dataobjects/FeatureStats.java b/src/main/java/com/datumbox/opensource/dataobjects/FeatureStats.java
new file mode 100644
index 0000000..1374adb
--- /dev/null
+++ b/src/main/java/com/datumbox/opensource/dataobjects/FeatureStats.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2014 Vasilis Vryniotis
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package com.datumbox.opensource.dataobjects;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * The FeatureStats Object stores all the fields generated by the FeatureExtraction
+ * class.
+ *
+ * @author Vasilis Vryniotis
+ * @see <a href="http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/">http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/</a>
+ */
+public class FeatureStats {
+    /**
+     * total number of Observations
+     */
+    public int n;
+
+    /**
+     * It stores the co-occurrences of Feature and Category values:
+     * feature -> (category -> count)
+     */
+    public Map<String, Map<String, Integer>> featureCategoryJointCount;
+
+    /**
+     * Measures how many times each category was found in the training dataset.
+     */
+    public Map<String, Integer> categoryCounts;
+
+    /**
+     * Constructor. Starts with zero observations and empty count tables.
+     */
+    public FeatureStats() {
+        n = 0;
+        featureCategoryJointCount = new HashMap<>();
+        categoryCounts = new HashMap<>();
+    }
+}
diff --git a/src/main/java/com/datumbox/opensource/dataobjects/NaiveBayesKnowledgeBase.java b/src/main/java/com/datumbox/opensource/dataobjects/NaiveBayesKnowledgeBase.java
new file mode 100644
index 0000000..dba23db
--- /dev/null
+++ b/src/main/java/com/datumbox/opensource/dataobjects/NaiveBayesKnowledgeBase.java
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2014 Vasilis Vryniotis
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package com.datumbox.opensource.dataobjects;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * The NaiveBayesKnowledgeBase Object stores all the fields that the classifier
+ * learns during training.
+ *
+ * @author Vasilis Vryniotis
+ * @see <a href="http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/">http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/</a>
+ */
+public class NaiveBayesKnowledgeBase {
+    /**
+     * number of training observations
+     */
+    public int n = 0;
+
+    /**
+     * number of categories
+     */
+    public int c = 0;
+
+    /**
+     * number of features
+     */
+    public int d = 0;
+
+    /**
+     * log priors for log( P(c) ), keyed by category
+     */
+    public Map<String, Double> logPriors = new HashMap<>();
+
+    /**
+     * log likelihood for log( P(x|c) ): feature -> (category -> log likelihood)
+     */
+    public Map<String, Map<String, Double>> logLikelihoods = new HashMap<>();
+}
diff --git a/src/main/java/com/datumbox/opensource/features/FeatureExtraction.java b/src/main/java/com/datumbox/opensource/features/FeatureExtraction.java
new file mode 100644
index 0000000..c47e36e
--- /dev/null
+++ b/src/main/java/com/datumbox/opensource/features/FeatureExtraction.java
@@ -0,0 +1,146 @@
+/*
+ * Copyright (C) 2014 Vasilis Vryniotis
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package com.datumbox.opensource.features;
+
+//import com.datumbox.opensource.dataobjects.Document;
+//import com.datumbox.opensource.dataobjects.FeatureStats;
+
+import com.datumbox.opensource.dataobjects.Document;
+import com.datumbox.opensource.dataobjects.FeatureStats;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * FeatureExtraction class which is used to generate the FeatureStats Object
+ * from the dataset and perform feature selection by using the Chisquare test.
+ *
+ * @author Vasilis Vryniotis
+ * @see http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/
+ */
+public class FeatureExtraction {
+
+    /**
+     * Generates a FeatureStats Object with metrics about the occurrences of the
+     * keywords in categories, the number of category counts and the total number
+     * of observations. These stats are used by the feature selection algorithm.
+     *
+     * A feature is counted once per document it appears in, regardless of how
+     * often it occurs inside that document.
+     *
+     * @param dataset the tokenized, labelled documents
+     * @return the counts gathered over the whole dataset
+     */
+    public FeatureStats extractFeatureStats(List<Document> dataset) {
+        FeatureStats stats = new FeatureStats();
+
+        for (Document doc : dataset) {
+            ++stats.n; //increase the number of observations
+            String category = doc.category;
+
+            //increase the category counter by one
+            Integer categoryCount = stats.categoryCounts.get(category);
+            stats.categoryCounts.put(category, categoryCount == null ? 1 : categoryCount + 1);
+
+            for (String feature : doc.tokens.keySet()) {
+                //get the counts of the feature in the categories
+                Map<String, Integer> featureCategoryCounts = stats.featureCategoryJointCount.get(feature);
+                if (featureCategoryCounts == null) {
+                    //initialize it if it does not exist
+                    featureCategoryCounts = new HashMap<>();
+                    stats.featureCategoryJointCount.put(feature, featureCategoryCounts);
+                }
+
+                //increase the number of occurrences of the feature in the category
+                Integer featureCategoryCount = featureCategoryCounts.get(category);
+                featureCategoryCounts.put(category, featureCategoryCount == null ? 1 : featureCategoryCount + 1);
+            }
+        }
+
+        return stats;
+    }
+
+    /**
+     * Perform feature selection by using the chisquare non-parametrical
+     * statistical test.
+     *
+     * The score of every (feature, category) pair is computed from its 2x2
+     * contingency table. All arithmetic is done in double precision: the
+     * denominator is a product of four document counts and overflows int
+     * arithmetic already for datasets of about a thousand documents.
+     *
+     * @param stats         the counts produced by extractFeatureStats
+     * @param criticalLevel the minimum chi-square score a feature needs in at
+     *                      least one category in order to be selected
+     * @return the selected features mapped to their highest score over all
+     *         categories
+     */
+    public Map<String, Double> chisquare(FeatureStats stats, double criticalLevel) {
+        Map<String, Double> selectedFeatures = new HashMap<>();
+
+        for (Map.Entry<String, Map<String, Integer>> featureEntry : stats.featureCategoryJointCount.entrySet()) {
+            String feature = featureEntry.getKey();
+            Map<String, Integer> categoryList = featureEntry.getValue();
+
+            //calculate the N1. (number of documents that have the feature)
+            double n1dot = 0;
+            for (Integer count : categoryList.values()) {
+                n1dot += count;
+            }
+
+            //also the N0. (number of documents that DONT have the feature)
+            double n0dot = stats.n - n1dot;
+
+            for (Map.Entry<String, Integer> categoryEntry : categoryList.entrySet()) {
+                //N11: documents that have the feature and belong to the category
+                double n11 = categoryEntry.getValue();
+                //N01: documents that do not have the feature BUT belong to the category
+                double n01 = stats.categoryCounts.get(categoryEntry.getKey()) - n11;
+                //N00: documents that neither have the feature nor belong to the category
+                double n00 = n0dot - n01;
+                //N10: documents that have the feature and don't belong to the category
+                double n10 = n1dot - n11;
+
+                double denominator = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00);
+                if (denominator == 0) {
+                    //degenerate table (feature present in every document, or a single
+                    //category): the test carries no information, so nothing is selected
+                    continue;
+                }
+
+                //calculate the chisquare score based on the above statistics
+                double difference = n11 * n00 - n10 * n01;
+                double chisquareScore = stats.n * difference * difference / denominator;
+
+                //if the score is larger than the critical value then add it in the list
+                if (chisquareScore >= criticalLevel) {
+                    Double previousScore = selectedFeatures.get(feature);
+                    if (previousScore == null || chisquareScore > previousScore) {
+                        selectedFeatures.put(feature, chisquareScore);
+                    }
+                }
+            }
+        }
+
+        return selectedFeatures;
+    }
+}
+
diff --git a/src/main/java/com/datumbox/opensource/features/TextTokenizer.java b/src/main/java/com/datumbox/opensource/features/TextTokenizer.java
new file mode 100644
index 0000000..6db8f17
--- /dev/null
+++ b/src/main/java/com/datumbox/opensource/features/TextTokenizer.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2014 Vasilis Vryniotis
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package com.datumbox.opensource.features;
+
+import com.datumbox.opensource.dataobjects.Document;
+
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * TextTokenizer class used to tokenize the texts and store them as Document
+ * objects.
+ *
+ * @author Vasilis Vryniotis
+ * @see http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/
+ */
+public class TextTokenizer {
+
+    /**
+     * Preprocess the text by removing punctuation, duplicate spaces and
+     * lowercasing it.
+     *
+     * Lowercasing uses Locale.ROOT so that the produced tokens do not depend
+     * on the locale of the machine (under a Turkish default locale "I" would
+     * otherwise become a dotless "ı" and stop matching the trained features).
+     *
+     * @param text the raw text; must not be null
+     * @return the text with every punctuation character replaced by a space,
+     *         runs of whitespace collapsed to a single space, in lower case
+     */
+    public static String preprocess(String text) {
+        return text.replaceAll("\\p{P}", " ").replaceAll("\\s+", " ").toLowerCase(Locale.ROOT);
+    }
+
+    /**
+     * A simple method to extract the keywords from the text. For real world
+     * applications it is necessary to extract also keyword combinations.
+     *
+     * Leading and trailing whitespace is ignored and blank input yields no
+     * keywords, so the empty string is never reported as a keyword.
+     *
+     * @param text the preprocessed text; must not be null
+     * @return the whitespace separated words of the text, possibly none
+     */
+    public static String[] extractKeywords(String text) {
+        String trimmed = text.trim();
+        if (trimmed.isEmpty()) {
+            return new String[0];
+        }
+        return trimmed.split("\\s+");
+    }
+
+ /**
+ * Counts the number of occurrences of the keywords inside the text.
+ *
+ * @param keywordArray
+ * @return
+ */
+ public static Map getKeywordCounts(String[] keywordArray) {
+ Map counts = new HashMap<>();
+
+ Integer counter;
+ for(int i=0;i trainingFiles;
+ public static NaiveBayesKnowledgeBase knowledgeBase;
+
+    // Trains the shared classifier once, when the class is loaded.
+    static {
+        //one bundled training resource per supported category
+        trainingFiles = new HashMap<>();
+        trainingFiles.put(PageCategory.HOBBIES.name(), BayesStaticInitializer.class.getResource("/training.hobby.txt"));
+        trainingFiles.put(PageCategory.FOOD.name(), BayesStaticInitializer.class.getResource("/training.food.txt"));
+        trainingFiles.put(PageCategory.AUTO.name(), BayesStaticInitializer.class.getResource("/training.auto.txt"));
+
+        //load the examples of every category whose resource is present
+        Map<String, String[]> trainingExamples = new HashMap<>();
+        for (Map.Entry<String, URL> entry : trainingFiles.entrySet()) {
+            URL resource = entry.getValue();
+            if (resource == null) {
+                continue; //resource not found on the classpath: the category is left out
+            }
+            try {
+                trainingExamples.put(entry.getKey(), readLines(resource));
+            } catch (IOException e) {
+                throw new RuntimeException("Can't initialize application", e);
+            }
+        }
+
+        //train classifier
+        NaiveBayes nb = new NaiveBayes();
+        nb.setChisquareCriticalValue(6.63); //0.01 pvalue
+        nb.train(trainingExamples);
+
+        //get trained classifier knowledgeBase
+        knowledgeBase = nb.getKnowledgeBase();
+    }
+
+    /**
+     * Reads the content behind the given URL as UTF-8 text, one array element
+     * per line. Empty lines are kept.
+     *
+     * @param url location of the text to read
+     * @return the lines in reading order, without line terminators
+     * @throws IOException if the URL cannot be opened or read
+     */
+    public static String[] readLines(URL url) throws IOException {
+        List<String> lines = new ArrayList<>();
+
+        //the whole reader chain is created inside the try header so that it is always closed
+        try (BufferedReader bufferedReader = new BufferedReader(
+                new InputStreamReader(url.openStream(), Charset.forName("UTF-8")))) {
+            String line;
+            while ((line = bufferedReader.readLine()) != null) {
+                lines.add(line);
+            }
+        }
+        return lines.toArray(new String[0]);
+    }
+}
diff --git a/src/main/java/com/getintent/interview/SimplePageContextualizer.java b/src/main/java/com/getintent/interview/SimplePageContextualizer.java
index 2702ce0..7349fc9 100644
--- a/src/main/java/com/getintent/interview/SimplePageContextualizer.java
+++ b/src/main/java/com/getintent/interview/SimplePageContextualizer.java
@@ -1,8 +1,28 @@
package com.getintent.interview;
+
+import com.datumbox.opensource.classifiers.NaiveBayes;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+
+import java.io.IOException;
+
public class SimplePageContextualizer implements PageContextualizer {
     @Override
     public PageCategory contextualize(String url) {
-        throw new UnsupportedOperationException("Need to implement");
+        try {
+            return innerContext(url);
+        } catch (IOException | RuntimeException e) {
+            //best effort: a page that cannot be fetched or classified is reported as UNKNOWN
+            return PageCategory.UNKNOWN;
+        }
+    }
+
+    /**
+     * Downloads the page and classifies its visible text with the shared,
+     * pre-trained Naive Bayes model.
+     *
+     * @param url address of the page to classify
+     * @return the predicted category, or UNKNOWN when the classifier declines
+     *         to pick one (predict returns null in that case)
+     * @throws IOException if the page cannot be downloaded
+     */
+    private PageCategory innerContext(String url) throws IOException {
+        Document page = Jsoup.connect(url).get();
+        NaiveBayes naiveBayes = new NaiveBayes(BayesStaticInitializer.knowledgeBase);
+        String predicted = naiveBayes.predict(page.text());
+        if (predicted == null) {
+            return PageCategory.UNKNOWN;
+        }
+        return PageCategory.valueOf(predicted);
     }
 }
diff --git a/src/main/resources/training.auto.txt b/src/main/resources/training.auto.txt
new file mode 100644
index 0000000..b0c1689
--- /dev/null
+++ b/src/main/resources/training.auto.txt
@@ -0,0 +1,828 @@
+car
+auto
+sedans
+acoustic
+airplanes
+amplitude
+autos
+automobile
+automobiles
+averaged
+backscattered
+bandlimited
+bicycles
+bikes
+blackbody
+boats
+broadband
+cabs
+carriages
+chemiluminescent
+convolved
+correlation
+drivers
+excitation
+fluorescence
+infrared
+locomotives
+lorries
+modulation
+motorcars
+motorcycles
+optical
+photoacoustic
+photon
+photothermal
+polarized
+probe
+racehorses
+ships
+steamers
+tractors
+trucks
+vehicles
+used
+aaa
+motorcade
+turntable
+armored
+grid
+heaps
+abet
+avis
+bumper car
+car park
+cavalcade
+couple
+fleet
+leased
+ram
+taxis
+automobile traffic
+bmgs
+breaker
+cable car
+car traffic
+cords
+crash
+creeper
+crush
+custom-built
+dali
+darts
+dented
+dock
+drag
+dragsters
+fins
+fourth
+junkers
+korean
+limited
+loaded
+loaners
+magic
+make
+ocasek
+orr
+owen
+park
+parking area
+parking lot
+pylon
+ramp
+rentals
+roomy
+royce
+sell
+sleek
+sleepers
+smash-up
+car
+auto
+sedans
+acoustic
+airplanes
+amplitude
+autos
+automobile
+automobiles
+averaged
+backscattered
+bandlimited
+bicycles
+bikes
+blackbody
+boats
+cabs
+carriages
+convolved
+correlation
+excitation
+fluorescence
+infrared
+locomotives
+lorries
+modulation
+motorcars
+motorcycles
+photoacoustic
+photon
+photothermal
+polarized
+probe
+racehorses
+ships
+steamers
+tractors
+vehicles
+aaa
+motorcade
+turntable
+grid
+heaps
+bumper car
+car park
+cavalcade
+couple
+fleet
+ram
+taxis
+automobile traffic
+bmgs
+breaker
+cable car
+car traffic
+cords
+crash
+creeper
+crush
+dali
+darts
+dock
+drag
+dragsters
+fins
+fourth
+junkers
+korean
+loaners
+magic
+make
+ocasek
+orr
+owen
+park
+parking area
+parking lot
+pylon
+ramp
+rentals
+sell
+sleepers
+smash-up
+speedway
+stock car
+transfer
+tyres
+which
+garage
+train
+funicular
+racer
+tipple
+cable railway
+paddock
+telescope
+yard
+barn
+bucket seat
+carport
+convoy
+coupling
+crazy
+dodgem
+fin
+flash
+freight train
+grease monkey
+hardtop
+locomotive
+monorail
+pace car
+pace lap
+pit
+produce
+race
+railroad
+roller coaster
+rumble seat
+running board
+shackle
+showroom
+shunter
+stable
+switch
+telpher
+tramway
+turnout
+whip
+wrecker
+addiction
+anti-sway bar
+aquaplane
+archibald wheel
+aristocrat
+assortment
+autocross
+automobile driver
+baccarat
+big-ticket item
+blue book
+boat
+boneyard
+breakdown van
+bridle path
+bridle road
+brightwork
+buggy whip
+cableway
+car carrier
+car dealer
+car mileage
+car port
+career
+carhop
+chamois cloth
+chemin de fer
+chicane
+chop shop
+clear
+concours d'elegance
+condition
+cosmetic
+coupler
+cruise
+demolition derby
+dents
+derv
+description
+dicky
+didicoy
+diners
+dinkey
+drag link
+drag race
+draw gear
+drawgear
+drive-in
+ecurie
+engineering
+fifth
+following
+formula
+freewheel
+fuel
+fuel cell
+funicular railway
+gasohol
+gondolas
+greaser
+gustavus franklin swift
+gutter
+gymkhana
+haulage
+have
+hire-purchase
+hood
+hurries
+indulgence
+jinny road
+
+joyride
+judge
+junkyard
+kill
+lay-by
+le mans start
+look-alike
+manifest
+mgs
+midget
+motoring
+opels
+parade lap
+part
+pintsch gas
+pit lane
+plaza
+porter
+position
+predilection
+pull-off
+pullman porter
+qualifying
+queue
+quote
+race driver
+rads
+rail
+railroad flat
+railroad train
+railway yard
+rallycross
+rattler
+repos
+rest area
+rest stop
+ric
+rods
+roll cage
+rubberneck
+safari park
+saleroom
+salesroom
+seatrain
+see
+service department
+shampoo
+she
+shoulder
+skidpan
+slidometer
+stabilizer bar
+staith
+starting grid
+superhighway
+sway bar
+swift
+switchman
+switchyard
+tachometer
+telfer
+telferage
+telpherage
+three-wheeler
+time
+tow car
+tow truck
+tracer
+track
+trailer
+tunnel
+turin
+turn
+used-car lot
+valets
+waiting
+warrant
+web
+widening
+yellowtail
+zip
+component
+drawbar
+equipment
+ferris wheel
+motorcycle
+waste
+sedan
+limo
+coupe
+dent
+siren
+tram
+apr
+axle
+bra
+demo
+diner
+gas
+lease
+accelerator
+airbag
+air conditioner
+air conditioning
+air filter
+air vent
+alarm
+all-wheel drive
+alternator
+antenna
+anti-lock brakes
+armrest
+auto
+automatic transmission
+automobile
+axle
+baby car seat
+baby seat
+back-up lights
+battery
+bench seat
+bonnet
+brake light
+brake pedal
+brakes
+bucket seat
+bumper
+camshaft
+car
+carburetor
+catalytic converter
+chassis
+child car seat
+chrome trim
+clutch
+computer
+console
+cooling system
+crankshaft
+cruise control
+cylinder
+dashboard
+defroster
+diesel engine
+dip stick
+differential
+door
+door handle
+drive belt
+drive shaft
+driver's seat
+emergency brake
+emergency lights
+emissions
+engine
+engine block
+exhaust pipe
+exhaust system
+fan belt
+fender
+filter
+floor mat
+fog light
+four-wheel drive
+frame
+fuel
+fuel cap
+fuel gauge
+fuse
+gas
+gasket
+gas pedal
+gas gauge
+gasoline
+gas tank
+gauge
+gearbox
+gear shift
+gear stick
+glove compartment
+GPS
+grille
+hand brake
+headlamp
+headlight
+headrest
+heater
+high-beam headlights
+hood
+horn
+hubcap
+hybrid
+ignition
+instrument panel
+interior light
+internal combustion engine
+jack
+key
+license plate
+lights
+lock
+low-beam headlights
+lug bolt
+lug nut
+manifold
+manual transmission
+mat
+mirror
+moon roof
+motor
+mud flap
+muffler
+navigation system
+odometer
+oil
+oil filter
+oil tank
+parking brake
+parking lights
+passenger seat
+pedal
+piston
+power brakes
+power steering
+power window switch
+radiator
+radio
+rag top
+rear-view mirror
+rear window defroster
+reverse light
+rims
+roof
+roof rack
+rotary engine
+seat
+seat belt
+shift
+shock absorber
+side airbags
+side mirror
+spare tire
+spark plug
+speaker
+speedometer
+spoiler
+starter
+steering column
+steering wheel
+sunroof
+sun visor
+suspension
+tachometer
+tailgate
+temperature gauge
+thermometer
+tire
+trailer hitch
+transmission
+trim
+trip computer
+trunk
+turbo charger
+turn signal
+undercarriage
+unleaded gas
+valve
+vents
+visor
+warning light
+wheel
+wheel well
+window
+windshield
+windshield wiper
+accelerator
+airbag
+air conditioner
+air conditioning
+air filter
+air vent
+alarm
+all-wheel drive
+alternator
+antenna
+anti-lock brakes
+armrest
+auto
+automatic transmission
+automobile
+axle
+baby car seat
+baby seat
+back-up lights
+battery
+bench seat
+bonnet
+brake light
+brake pedal
+brakes
+bucket seat
+bumper
+camshaft
+car
+carburetor
+catalytic converter
+chassis
+child car seat
+chrome trim
+clutch
+computer
+console
+cooling system
+crankshaft
+cruise control
+cylinder
+dashboard
+defroster
+diesel engine
+dip stick
+differential
+door
+door handle
+drive belt
+drive shaft
+driver's seat
+emergency brake
+emergency lights
+emissions
+engine
+engine block
+exhaust pipe
+exhaust system
+fan belt
+fender
+filter
+floor mat
+fog light
+four-wheel drive
+frame
+fuel
+fuel cap
+fuel gauge
+fuse
+gas
+gasket
+gas pedal
+gas gauge
+gasoline
+gas tank
+gauge
+gearbox
+gear shift
+gear stick
+glove compartment
+GPS
+grille
+hand brake
+headlamp
+headlight
+headrest
+heater
+high-beam headlights
+hood
+horn
+hubcap
+hybrid
+ignition
+instrument panel
+interior light
+internal combustion engine
+jack
+key
+license plate
+lights
+lock
+low-beam headlights
+lug bolt
+lug nut
+manifold
+manual transmission
+mat
+mirror
+moon roof
+motor
+mud flap
+muffler
+navigation system
+odometer
+oil
+oil filter
+oil tank
+parking brake
+parking lights
+passenger seat
+pedal
+piston
+power brakes
+power steering
+power window switch
+radiator
+radio
+rag top
+rear-view mirror
+rear window defroster
+reverse light
+rims
+roof
+roof rack
+rotary engine
+seat
+seat belt
+shift
+shock absorber
+side airbags
+side mirror
+spare tire
+spark plug
+speaker
+speedometer
+spoiler
+starter
+steering column
+steering wheel
+sunroof
+sun visor
+suspension
+tachometer
+tailgate
+temperature gauge
+thermometer
+tire
+trailer hitch
+transmission
+trim
+trip computer
+trunk
+turbo charger
+turn signal
+undercarriage
+unleaded gas
+valve
+vents
+visor
+warning light
+wheel
+wheel well
+window
+windshield
+windshield wiper
+acceleration
+accessories
+adventure
+agility
+air injection
+alloys
+automobile
+boxer engine
+brake pad
+car
+carbon fiber
+comfort
+construction
+control
+convertible
+coupe
+crash rating
+cruise control
+design
+drafting
+driver
+driving
+elegance
+emission
+engine
+engineering
+excellence
+experience
+exterior
+feature
+feeling
+form
+function
+handling
+horsepower
+incentive
+innovation
+integration
+interior
+luxury
+make
+mile
+mileage
+model
+motor
+motorcycle
+options
+passenger
+perfection
+performance
+power
+precision
+pricing
+profile
+quality
+refinement
+ride
+road
+road contact
+safety
+sedan
+sophistication
+specs
+speed
+sportiness
+sportster
+standard
+style
+styling
+system
+technology
+throttle
+transmission
+trims
+vehicle
+versatility
+
diff --git a/src/main/resources/training.food.txt b/src/main/resources/training.food.txt
new file mode 100644
index 0000000..5c47e73
--- /dev/null
+++ b/src/main/resources/training.food.txt
@@ -0,0 +1,1271 @@
+acornsquash
+alfalfasprouts
+almond
+anchovy
+anise
+appetizer
+appetite
+apple
+apricot
+artichoke
+asparagus
+aspic
+ate
+avocado
+bacon
+bagel
+bake
+bakedAlaska
+bambooshoots
+banana
+barbecue
+barley
+basil
+batter
+beancurd
+beans
+beef
+beet
+bellpepper
+berry
+biscuit
+bitter
+blackbeans
+blackberry
+black-eyedpeas
+blacktea
+bland
+bloodorange
+blueberry
+boil
+bowl
+boysenberry
+bran
+bread
+breadfruit
+breakfast
+brisket
+broccoli
+broil
+brownie
+brownrice
+brunch
+Brusselssprouts
+buckwheat
+buns
+burrito
+butter
+butterbean
+cake
+calorie
+candy
+candyapple
+cantaloupe
+capers
+caramel
+caramelapple
+carbohydrate
+carrot
+cashew
+cassava
+casserole
+cater
+cauliflower
+caviar
+cayennepepper
+celery
+cereal
+chard
+cheddar
+cheese
+cheesecake
+chef
+cherry
+chew
+chicken
+chickpeas
+chili
+chips
+chives
+chocolate
+chopsticks
+chow
+chutney
+cilantro
+cinnamon
+citron
+citrus
+clam
+cloves
+cobbler
+coconut
+cod
+coffee
+coleslaw
+collardgreens
+comestibles
+cook
+cookbook
+cookie
+corn
+cornflakes
+cornmeal
+cottagecheese
+crab
+crackers
+cranberry
+cream
+creamcheese
+crepe
+crisp
+crunch
+crust
+cucumber
+cuisine
+cupboard
+cupcake
+curds
+currants
+curry
+custard
+daikon
+dailybread
+dairy
+dandeliongreens
+Danishpastry
+dates
+dessert
+diet
+digest
+digestivesystem
+dill
+dine
+diner
+dinner
+dip
+dish
+dough
+doughnut
+dragonfruit
+dressing
+dried
+drink
+dry
+durian
+eat
+Edamcheese
+edible
+egg
+eggplant
+elderberry
+endive
+entree
+fast
+fat
+favabeans
+feast
+fed
+feed
+fennel
+fig
+fillet
+fire
+fish
+flan
+flax
+flour
+food
+foodpyramid
+foodstuffs
+fork
+freezer
+Frenchfries
+fried
+fritter
+frosting
+fruit
+fry
+
+G
+garlic
+gastronomy
+gelatin
+ginger
+gingerale
+gingerbread
+glasses
+Goudacheese
+grain
+granola
+grape
+grapefruit
+grated
+gravy
+greenbean
+greens
+greentea
+grub
+guacamole
+guava
+gyro
+
+H
+herbs
+halibut
+ham
+hamburger
+hash
+hazelnut
+herbs
+honey
+honeydew
+horseradish
+hot
+hotdog
+hotsauce
+hummus
+hunger
+hungry
+ice
+iceberglettuce
+icedtea
+icing
+icecream
+icecreamcone
+jackfruit
+jalapeno
+jam
+jelly
+jellybeans
+jicama
+jimmies
+Jordanalmonds
+jug
+julienne
+juice
+junkfood
+kale
+kebab
+ketchup
+kettle
+kettlecorn
+kidneybeans
+kitchen
+kiwi
+knife
+kohlrabi
+kumquat
+
+ladle
+lamb
+lard
+lasagna
+legumes
+lemon
+lemonade
+lentils
+lettuce
+licorice
+limabeans
+lime
+liver
+loaf
+lobster
+lollipop
+loquat
+lox
+lunch
+lunchbox
+lunchmeat
+lychee
+macaroni
+macaroon
+maincourse
+maize
+mandarinorange
+mango
+maplesyrup
+margarine
+marionberry
+marmalade
+marshmallow
+mashedpotatoes
+mayonnaise
+meat
+meatball
+meatloaf
+melon
+menu
+meringue
+micronutrient
+milk
+milkshake
+millet
+mincemeat
+minerals
+mint
+mints
+mochi
+molasses
+molesauce
+mozzarella
+muffin
+mug
+munch
+mushroom
+mussels
+mustard
+mustardgreens
+mutton
+napkin
+nectar
+nectarine
+nibble
+noodles
+nosh
+nourish
+nourishment
+nut
+nutmeg
+nutrient
+nutrition
+nutritious
+
+oats
+oatmeal
+oil
+okra
+oleo
+olive
+omelet
+omnivore
+onion
+orange
+order
+oregano
+oven
+oyster
+
+pan
+pancake
+papaya
+parsley
+parsnip
+pasta
+pastry
+pate
+patty
+pattypansquash
+peach
+peanut
+peanutbutter
+pea
+pear
+pecan
+peapod
+pepper
+pepperoni
+persimmon
+pickle
+picnic
+pie
+pilaf
+pineapple
+pitabread
+pitcher
+pizza
+plate
+platter
+plum
+poached
+pomegranate
+pomelo
+pop
+popsicle
+popcorn
+popovers
+pork
+porkchops
+pot
+potato
+potroast
+preserves
+pretzel
+primerib
+protein
+provisions
+prune
+pudding
+pumpernickel
+pumpkin
+punch
+
+quiche
+quinoa
+
+radish
+raisin
+raspberry
+rations
+ravioli
+recipe
+refreshments
+refrigerator
+relish
+restaurant
+rhubarb
+ribs
+rice
+roast
+roll
+rollingpin
+romaine
+rosemary
+rye
+saffron
+sage
+salad
+salami
+salmon
+salsa
+salt
+sandwich
+sauce
+sauerkraut
+sausage
+savory
+scallops
+scrambled
+seaweed
+seeds
+sesameseed
+shallots
+sherbet
+shishkebab
+shrimp
+slaw
+slice
+smoked
+snack
+soda
+sodabread
+sole
+sorbet
+sorghum
+sorrel
+soup
+sour
+sourcream
+soy
+soybeans
+soysauce
+spaghetti
+spareribs
+spatula
+spices
+spicy
+spinach
+splitpeas
+spoon
+spork
+sprinkles
+sprouts
+spuds
+squash
+squid
+steak
+stew
+stir-fry
+stomach
+stove
+straw
+strawberry
+stringbean
+stringy
+strudel
+subsandwich
+submarinesandwich
+succotash
+suet
+sugar
+summersquash
+sundae
+sunflower
+supper
+sushi
+sustenance
+sweet
+sweetpotato
+Swisschard
+syrup
+taco
+take-out
+tamale
+tangerine
+tapioca
+taro
+tarragon
+tart
+tea
+teapot
+teriyaki
+thyme
+toast
+toaster
+toffee
+tofu
+tomatillo
+tomato
+torte
+tortilla
+tuber
+tuna
+turkey
+turmeric
+turnip
+
+uglifruit
+unleavened
+utensils
+
+vanilla
+veal
+vegetable
+venison
+vinegar
+vitamin
+
+wafer
+waffle
+walnut
+wasabi
+water
+waterchestnut
+watercress
+watermelon
+wheat
+whey
+whippedcream
+wok
+
+yam
+yeast
+yogurt
+yolk
+
+Blackened
+Blah
+Blanched
+Bland
+Blended
+Bold
+Bolognese
+Boned
+Brackish
+Braised
+Brewed
+Briny
+Brittle
+Broiled
+Browned
+Bubbly
+Burning
+Bursting
+Buttercream
+Butterflied
+Buttery
+Cacciatore
+Cakey
+Candied
+Canned
+Caramelized
+Caustic
+Chalky
+Acidic
+Acrid
+Airy
+Alacarte
+Alaking
+Alamode
+Alcoholic
+Aldente
+Almondflavored
+Ambrosial
+Appetizing
+Aroma
+Aromatic
+Aufromage
+Augratin
+Aujus
+Balsamic
+Barbecue
+Battered
+Béarnaise
+Bite-size
+Biting
+Bitter
+Blackened
+Blah
+Blanched
+Bland
+Blended
+Charcuterie
+Charred
+Cheesy
+Chewy
+Chili
+Chilled
+Chipotle
+Chocolaty
+Chopped
+Chowder
+Clarified
+Classical
+ComfortFood
+Condensed
+Condiment
+Course
+Creamed
+Creamery
+Creamy
+Creole
+Crisscrossed
+Crispy
+Crumbly
+Crunchy
+Crusty
+Crystalized
+Cuisine
+Curd
+Curdled
+Cured
+Curried
+Dash
+Decadent
+Deglaze
+Dehydrated
+Delectable
+Delicious
+Delightful
+Dense
+Devein
+Deviled
+Dietary
+Diluted
+Dipping
+Disagreeable
+Disgusting
+Distasteful
+Distinctive
+Divine
+Doughy
+Dredged
+Drenched
+Dripping
+Driedout
+Drizzled
+Dry
+Dry-Roasted
+Dull
+Dusted
+Earthy
+Eatable
+Edible
+Enjoyable
+Enticing
+Entrée
+Escalloped
+Etouffee
+Evaporated
+Exquisite
+Fatty
+Fermented
+Fine
+FingerLickingGood
+Fibrous
+Filled
+Filling
+Fiery
+Fishy
+Fizzy
+Flakey
+Flambé
+Flavorless
+Flavorsome
+Florentine
+Floury
+Fluffy
+Foiegras
+Folded
+Fondant
+Foul
+Fradiablo
+Fragrant
+Feathery
+Fresh
+Freezedried
+Fricasseed
+Fried
+Frosty
+Frozen
+Fruity
+Fudgy
+Full-bodied
+Full-flavored
+Gamy
+Garlicky
+Garnish
+Gastric
+Gingery
+Glazed
+Gloppy
+Glossy
+Gluteny
+Golden
+Good
+Gooey
+Grainy
+Granulated
+Grated
+Gratifying
+Greasy
+Griddled
+Grilled
+Gritty
+Gross
+Hardboiled
+Heady
+Heat
+Heavy
+Healthy
+Hearty
+Heavenly
+Herbaceous
+Hint
+Homogenized
+Honeyed
+Horsd’oeuvre
+Hot
+HotSauce
+Icy
+Infused
+Intense
+Inviting
+Jiggly
+Juicy
+Julienne
+Kick
+Kneaded
+Kosher
+Laced
+Laden
+Laiche
+Layered
+Lemony
+Light
+Limp
+Lip-smacking
+Liquid
+Low-Fat
+Lumpy
+Luscious
+Lusty
+Lyonnaise
+Malodorous
+Marinate
+Marvelous
+Mashed
+Mealy
+Medium
+Mellow
+Melting
+Messy
+Microwave
+Mild
+Milky
+Minced
+Minty
+Mixed
+Mixture
+Moist
+Moldy
+mouth-watering
+Muddy
+Mushy
+Nasty
+Natural
+Nauseating
+Nectarous
+Nosey
+Nourishing
+Noxious
+Nuked
+Nutriment
+Nutritious
+Nutty
+Odoriferous
+Odorless
+Oily
+Oniony
+Oozing
+Organic
+Overpowering
+Packed
+Palatable
+Pan-fried
+Paprika
+Parboiled
+Parched
+Parfait
+Pasteurized
+Pasty
+Pâté
+Peanutbutter
+Peck
+Penetrating
+Peppered
+Peppery
+Perfumed
+Perishable
+Piccata
+Pickled
+Piping
+Piquant
+Pleasant
+Powdered
+Powdery
+Potent
+Pouched
+Preserved
+Puffy
+Pulp
+Pulverized
+Pungent
+Puréed
+Ragout
+Rancid
+Rank
+Rare
+Raw
+Redolent
+Reduced
+Reeking
+Refrigerated
+Refreshing
+Relish
+Rich
+RibSticking
+Ripe
+Roasted
+Robust
+Rolled
+Rotten
+Roux
+Ruined
+Runny
+Saline
+Salted
+Salty
+Saturated
+Sapid
+Saporous
+Sauté
+Savory
+Scalded
+Scented
+Scorched
+Scrambled
+Scrumptious
+Seared
+Seasoned
+Sharp
+Shredded
+Sizzling
+Simmering
+Skimmed
+Skunky
+Slathered
+Sliced
+Slimy
+Slippery
+Slivered
+Smelly
+Smokey
+Smooth
+Smothered
+Snappy
+Snappy
+Soaked
+Sodden
+Soft
+soft-boiled
+Soggy
+Solid
+Sordid
+Soufflé
+Soupy
+Sour
+Sparkling
+Spicy
+Spirited
+Spoiled
+Spongy
+Spread
+Sprinkled
+Spritzed
+Stale
+Starchy
+Steamy
+Stewed
+Stiff
+Stinging
+Stringy
+Stinky
+Strong
+Stuffed
+Subdued
+Succulent
+Sunnysideup
+sugar-coated
+Sugary
+Sweet
+SweetandSour
+Syrupy
+Tainted
+Tangy
+Tantalizing
+Tart
+Tasteless
+Tasty
+Tempting
+Tender
+Tepid
+Texture
+Thick
+Titillating
+Toasted
+Toothsome
+Tough
+Tumaceous
+Umami
+Unsavory
+Vanilla
+Velvety
+Vicious
+Vinegary
+Warm
+Watery
+Well-done
+Wet
+Whey
+Whipped
+Wholesome
+Wild
+Wilted
+Wrapped
+Yucky
+Yummy
+Zest
+Zestful
+Zesty
+Zippy
+forage
+meat
+cereal
+eats
+provender
+edible
+fish
+fodder
+aliment
+foodstuff
+game
+victuals
+seasoning
+eating
+nourishment
+nutriment
+staple
+condiment
+nutrient
+vegetable
+rations
+domestic
+fuel
+manure
+foods
+comestibles
+drink
+equipment
+fertilizer
+foodlessness
+foodstuffs
+gadoid
+harvestable
+munition
+nonferrous
+nonfood
+pharmaceutical
+provisions
+refreshments
+saltwater
+storable
+surplus
+feed
+manna
+taco
+diet
+poi
+tofu
+fast
+taro
+tuna
+usda
+dish
+egg
+mess
+slop
+puree
+rice
+pap
+serve
+cola
+bran
+chow
+cod
+cook
+msg
+togo
+victual
+bite
+bread
+cake
+eel
+grub
+opah
+pasta
+sauce
+shad
+taste
+bait
+deli
+meal
+salad
+smelt
+famine
+prey
+repast
+sole
+ambrosia
+fries
+hay
+plate
+provision
+viand
+agar
+can
+cero
+cud
+eatable
+fare
+grain
+salt
+tamale
+whiting
+board
+chili
+chyme
+dietary
+eggs
+hunger
+larder
+ling
+pabulum
+seed
+snapper
+suet
+sushi
+sustenance
+yam
+calorie
+caterer
+digest
+dumbwaiter
+epicure
+fowl
+gruel
+hash
+mast
+milk
+nutrition
+oat
+pompano
+scad
+soda
+soul
+stomach
+taster
+trout
+yogurt
+bass
+casserole
+chef
+curd
+fat
+garnish
+gas
+haddock
+hake
+hunt
+morsel
+mullet
+pike
+sago
+saran
+sardine
+spread
+stodge
+table
+takeout
+tray
+ate
+carob
+cheese
+corn
+cuisine
+data
+dole
+dye
+gorge
+kosher
+millet
+nectar
+organic
+perch
+pie
+potato
+preserve
+ration
+recipe
+salmon
+salmonella
+season
+sop
+spice
+wrasse
+appetite
+aspic
+automat
+beef
+bonito
+carp
+chew
+chocolate
+coloring
+colouring
+comestible
+digestion
+fda
+fry
+gar
+gourmet
+grocer
+halibut
+herring
+mackerel
+oleo
+omelet
+pantry
+pea
+pickle
+pizza
+pollack
+porgy
+pot
+rasorial
+refreshment
+roughage
+scallop
+scup
+snook
+soup
+spoon
+swill
+tilefish
+tin
+tinfoil
+turbot
+venison
+alewife
+appetizer
+barley
+bib
+bin
+botulism
+bowl
+catering
+chip
+cibophobia
+cocoa
+commissariat
+commissary
+craw
+delicatessen
+dicer
+feeding
+freezer
+granola
+ham
+ice
+kidney
+krill
+lap
+lientery
+menu
+monophagous
+oil
+order
+phloem
+platter
+pork
+porridge
+prog
+ptomaine
+ricer
+roe
+scrod
+server
+serving
+slaughter
+souse
+soybean
+sprat
+supply
+tapioca
+thai
+tine
+viands
+wrap
+amyloid
+aroma
+brill
+bromatology
+carcass
+cheer
+chicken
+chub
+concoction
+convenience
+food
+crepe
+curry
+dace
+delicacy
+dietetics
+dollop
+eater
+enzyme
+fao
+farina
+feeder
+flour
diff --git a/src/main/resources/training.hobby.txt b/src/main/resources/training.hobby.txt
new file mode 100644
index 0000000..fb28484
--- /dev/null
+++ b/src/main/resources/training.hobby.txt
@@ -0,0 +1,605 @@
+Activities
+Angling
+Antique collecting
+Aquariums
+Art
+Artifacts
+Avocation
+Backpacking
+Ballooning
+Biking
+Bingo
+Bird-watching
+Board games
+Boating
+Bridge
+Camping
+Cards
+Carpentry
+Carving
+Ceramics
+Classes
+Coin collecting
+Collectibles
+Collection
+Cooking
+Crafts
+Crewel
+Crochet
+Crossword
+puzzles
+Dancing
+Dealer
+Decoupage
+Delight
+Embroidery
+Enjoy
+Enrichment
+Entertainment
+Expertise
+Fishing
+Free time
+Gambling
+Games
+Gardening
+Handicrafts
+Hiking
+Hunting
+Ice-skating
+Ikebana
+Interests
+Jogging
+Joy
+Kite flying
+Knitting
+Leisure
+Linger
+Loiter
+Macrame
+Magic
+Model airplanes
+Model cars
+Model trains
+Models
+Mountaineering
+Needlepoint
+Numismatics
+Obsession
+Origami
+Painting
+Passion
+Pastime
+Philately
+Photography
+Playing cards
+Pleasure
+Possession
+Pottery making
+Puppetry
+Purchases
+Puzzles
+Quilting
+Reading
+Relaxation
+Retirement
+Riding
+Rock climbing
+Rug hooking
+Sales
+Scrimshaw
+Scuba diving
+Sewing
+Sharing
+Shells
+Shopping
+Skeet shooting
+Skill
+Skydiving
+Snorkeling
+Spelunking
+Sports
+Stamp collections
+Stitch
+Surfing
+Time
+Trinkets
+Weaving
+Whittling
+Woodworking
+
+Yachting
+Yoga
+
+
+Action Figures
+Antiques
+Autograph Collecting
+Car Collecting
+Coin Collecting
+Comic Books
+Concert Posters
+Doll Collecting
+Fine Art Collecting
+Hot Wheel and Matchbox Cars
+Manga
+Movie
+Music
+Spoon Collecting
+Sports Collectibles
+Sports Trading Cards
+Stamp Collecting
+Vinyl Records
+Watch Collecting
+Gun and PistolsAnimation
+Architecture
+Calligraphy
+Candle Making
+Crochet
+Film Making
+Gardening
+Jewelry Making
+Origami
+Photography
+Sewing
+Sculpting
+Ceramics
+Pottery
+Fashion Design
+Floristry
+Graffiti
+Knitting
+Paper Airplanes
+Painting and Drawing
+Quilting
+Scrapbooking
+Woodworking
+TattooHam Radio
+RC Boats
+RC Cars
+RC Helicopters
+RC Planes
+Robotics
+Scale Models
+Model Cars
+Model Airplanes
+Model Railroading
+Model Rockets
+Model Ship
+Boat Kit
+Dancing
+Ballet
+Break Dancing
+Line Dancing
+Salsa
+Swing
+Tango
+Waltz
+Acting
+Juggling
+Magic Tricks
+Puppetry
+Stand Up Comedy
+Banjo
+Bass Guitar
+Cello
+Clarinet
+Drum Set
+French Horn
+Guitar
+Harmonica
+Oboe
+Piano
+Trumpet
+Trombone
+Violin
+Viola
+Rapping
+Singing
+Bartending
+Beer Brewing
+Beer Tasting
+Cigar Smoking
+Cheese Tasting
+Coffee Roasting
+Competitive Eating
+Cooking
+Liquor Distillation
+Hookah Smoking
+Spirits
+Liquor Tasting
+Sushi Making
+Tea Drinking
+Wine Making
+Wine Tasting
+Sake Tasting
+Grilling
+Cats
+Dogs
+Parrots
+Rabbits
+Reptiles
+Rodents
+Snakes
+Turtles
+FishkeepingArcade Games
+Ball and Jacks
+Billiards
+Pool
+Board Games
+Bridge
+Card Games
+Card Tricks
+Chess
+Dominoes
+Foosball
+Geocaching
+Jigsaw Puzzles
+Mah Jong
+Pinball Machines
+Poker
+Table Tennis - Ping Pong
+Video Games
+Archery
+Acrobatics
+Badminton
+Bodybuilding
+Bowling
+Boxing
+Croquet
+Cycling
+Diving
+Golf
+Gymnastics
+Fencing
+Horseback Riding
+Ice Skating
+Inline Skating
+Pilates
+Running
+Swimming
+Squash
+Tai Chi
+Tennis
+Weight Training
+Yoga
+basketball
+baseball
+football
+cricket
+volleyball
+soccer
+water polo
+Aikido
+Jiu Jitsu
+Judo
+Karate
+Kung Fu
+Taekwondo
+Birdwatching
+Camping
+Fishing
+Hiking
+Hunting
+Kayak and Canoe
+Mountain Biking
+Mountain climbing
+Paintball
+River Rafting
+Rock Climbing
+Sailing
+Scuba Diving
+Fly Fishing
+Backpacking
+Kitesurfing
+Skateboarding
+Skiing
+Snowboarding
+Surfing
+Windsurfing
+Autoracing
+Go Karts
+Motocross
+Motorcycle - Touring
+Motorcycle Stunts
+Off Road Driving
+Snowmobiling
+pastime
+avocation
+cockhorse
+hobbies
+hobbyhorse
+sport
+toy
+amusement
+brokendown
+buggy
+by-line
+chase
+circus
+clydesdale
+confidant
+confidante
+dead
+diversions
+entree
+falco subbuteo
+flemish
+friesian
+frisian
+gaited
+gimmick
+hack
+hobbyist
+icelandic
+livestock
+normandy
+outing
+paced
+pastimes
+place
+playground
+plaything
+playthings
+rocker
+rocking horse
+sideline
+souvenir
+spare-time activity
+stick horse
+treat
+truck
+den
+kit
+ant
+interest
+balsa
+model
+philately
+bag
+dada
+deltiology
+fun
+golf
+bonsai
+bug
+cartophily
+cigarette card
+crochet
+escape
+fad
+fan
+kite
+macrame
+metalworker
+oveta
+poetry
+pursuit
+rocketry
+scripophily
+side
+spelaeology
+speleology
+taxidermy
+violin
+woodcarving
+spelunker
+cave
+give
+horse
+addict
+amateurism
+bird-nesting
+bird-watching
+bottle collection
+boxer
+buff
+building
+collect
+collection
+collector
+domesticity
+fanatic
+garden
+guild
+hobbler
+hobbyhorsical
+notaphily
+occupation
+photographer
+ploy
+spotter
+sunday painter
+tackle
+tinker
+workroom
+enthusiasm
+spa
+eden
+sty
+edda
+nest
+home
+arena
+set
+inn
+bed
+lair
+tee
+flea
+sea
+tomb
+lea
+range
+agora
+altar
+dais
+lab
+pit
+seat
+station
+lei
+camp
+park
+ride
+slide
+haunt
+pram
+hunt
+net
+seal
+tamer
+tent
+trapeze
+zoo
+hell
+lay
+rest
+shrine
+deli
+pen
+plant
+spree
+resort
+seesaw
+ararat
+asia
+bar
+base
+berth
+cache
+chapel
+closet
+corner
+cradle
+desk
+forum
+gym
+hive
+hotel
+lodge
+mat
+mecca
+move
+native
+niche
+pier
+position
+post
+rendezvous
+rink
+roost
+salon
+scar
+shop
+silo
+site
+spot
+stage
+stand
+there
+utopia
+arcade
+box
+grin
+line
+play
+show
+slot
+swing
+amish
+cab
+dune
+run
+stick
+tag
+tree
+cage
+clown
+ring
+stilt
+top
+stable
+repository
+carrion
+corpse
+court
+crypt
+drop
+end
+fall
+grave
+hades
+heaven
+last
+memento
+office
+purgatory
+relic
+requiem
+slough
+sodom
+till
+urn
+zombi
+zombie
+be
+put
+door
+plate
+roast
+salad
+steak
+stew
+venue
+back
+wharf
+saga
+barn
+carry
+farm
+market
+point
+stock
+yard
+norman
+hike
+out
+tour
+visit
+pad
+abode
+alley
+anchorage
+ark
+arm
+asylum
+atom
+attic
+bin
+cafe
+center
+chest
+cloister
+coop
+cot
+cote
+cover
+crib
+deposit
+depot
+desert
+distance
+dock
+domicile
+dump
+ear
+exchange
+field
+first
+garage
+garrison
+gehenna
+go
+habitat
+hall
+harbor