Maluuba · sharan21 · Jan 13, 2022
diff --git a/models/naturalness_classifiers/neural_ARAE.h5 b/models/naturalness_classifiers/neural_ARAE.h5
diff --git a/models/naturalness_classifiers/neural_CAAE.h5 b/models/naturalness_classifiers/neural_CAAE.h5
diff --git a/models/naturalness_classifiers/neural_DAR.h5 b/models/naturalness_classifiers/neural_DAR.h5
diff --git a/models/vectorizer.pkl b/models/vectorizer.pkl
diff --git a/nlgeval/__init__.py b/nlgeval/__init__.py
@@ -9,14 +9,25 @@
 from nlgeval.pycocoevalcap.cider.cider import Cider
 from nlgeval.pycocoevalcap.meteor.meteor import Meteor
 from nlgeval.pycocoevalcap.rouge.rouge import Rouge
-
+from nlgeval.naturalness.naturalness import NaturalnessClassifier
 
 # str/unicode stripping in Python 2 and 3 instead of `str.strip`.
 def _strip(s):
     return s.strip()
 
+def get_sents(paths):
+    """Collect the cleaned lines of every file listed in `paths`.
+
+    :param paths: iterable of text-file paths; the files are read in order.
+    :return: flat list holding `clean_line(line)` for each line of each file.
+    """
+    cleaned = []
+    for file_path in paths:
+        with open(file_path, 'r') as handle:
+            cleaned.extend(clean_line(raw) for raw in handle)
+    return cleaned
+
+# Single-pass translation table for `clean_line`: '-' and '/' become spaces,
+# every character in the third argument is deleted.
+_CLEAN_LINE_TABLE = str.maketrans('-/', '  ', ',.\'!)(;%?"')
+
+
+def clean_line(line):
+    """Lower-case `line` and normalise its punctuation.
+
+    The characters , . ' ! ) ( ; % ? " are removed; '-' and '/' are each
+    replaced by a single space. Whitespace, including a trailing newline,
+    is left untouched.
+
+    :param line: text to normalise.
+    :return: the normalised string.
+    """
+    return line.lower().translate(_CLEAN_LINE_TABLE)
 
-def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
+def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False, not_naturalness=False, naturalness_model='CAAE'):
     with open(hypothesis, 'r') as f:
         hyp_list = f.readlines()
     ref_list = []
@@ -80,10 +91,17 @@ def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=Fa
             value = float(value.strip())
             ret_scores[name] = value
 
+    if not not_naturalness:
+        neural_classifier = NaturalnessClassifier(naturalness_model)
+        hyp = get_sents(references)
+        scores = neural_classifier.score(hyp).mean()
+        ret_scores['Naturalness'] = scores
+        print("Mean Naturalness of references: {}".format(ret_scores['Naturalness'])) 
+
     return ret_scores
 
 
-def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False):
+def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False, not_naturalness=False, naturalness_model='CAAE'):
     assert isinstance(hyp, six.string_types)
 
     if isinstance(ref, six.string_types):
@@ -141,6 +159,11 @@ def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False
             name, value = score.split(':')
             value = float(value.strip())
             ret_scores[name] = value
+
+    if not not_naturalness:
+        neural_classifier = NaturalnessClassifier(naturalness_model)
+        scores = neural_classifier.score(ref).mean()
+        ret_scores['Naturalness'] = scores
 
     return ret_scores
 
@@ -163,8 +186,7 @@ class NLGEval(object):
                         'SkipThoughtCS',
                     } | glove_metrics
 
-    def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False,
-                 metrics_to_omit=None):
+    def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False, no_naturalness=False, naturalness_model='CAAE', metrics_to_omit=None):
         """
         :param no_overlap: Default: Use overlap metrics.
             `True` if these metrics should not be used.
@@ -194,6 +216,9 @@ def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False,
             "Invalid metrics to omit: {}".format(self.metrics_to_omit - self.valid_metrics)
 
         self.no_overlap = no_overlap
+        self.no_naturalness = no_naturalness
+        self.naturalness_model = naturalness_model
+
         if not no_overlap:
             self.load_scorers()
 
@@ -282,6 +307,11 @@ def compute_individual_metrics(self, ref, hyp):
                 name, value = score.split(':')
                 value = float(value.strip())
                 ret_scores[name] = value
+
+        if not self.no_naturalness:
+            neural_classifier = NaturalnessClassifier(self.naturalness_model)
+            scores = neural_classifier.score(ref).mean()
+            ret_scores['Naturalness'] = scores
 
         return ret_scores
 
@@ -319,5 +349,13 @@ def compute_metrics(self, ref_list, hyp_list):
                 name, value = score.split(':')
                 value = float(value.strip())
                 ret_scores[name] = value
-
-        return ret_scores
+
+        if not self.no_naturalness:
+            neural_classifier = NaturalnessClassifier(self.naturalness_model)
+            all_refs = []
+            for ref in ref_list:
+                all_refs.extend(ref)
+            scores = neural_classifier.score(all_refs).mean()
+            ret_scores['Naturalness'] = scores
+
+        return ret_scores
diff --git a/nlgeval/naturalness/__init__.py b/nlgeval/naturalness/__init__.py
diff --git a/nlgeval/naturalness/naturalness.py b/nlgeval/naturalness/naturalness.py
@@ -0,0 +1,85 @@
+
+'''
+@article{Mir2019EvaluatingST,
+  title={Evaluating Style Transfer for Text},
+  author={Remi Mir and Bjarke Felbo and Nick Obradovich and Iyad Rahwan},
+  journal={ArXiv},
+  year={2019},
+  volume={abs/1904.02295}
+}
+
+Implementation and Pretrained Classifier models from the repo: https://github.com/passeul/style-transfer-model-evaluation
+
+This code can be used to evaluate the naturalness of output sentiment texts of examined style transfer models.
+
+For a baseline understanding of what is considered "natural," any method used for automated evaluation of naturalness 
+also requires an understanding of the human-sourced input texts. 
+
+Inspired by the adversarial evaluation approach in "Generating Sentences from a Continuous Space"
+(Bowman et al., 2016), we trained unigram logistic regression classifiers and LSTM logistic regression classifiers 
+on samples of input texts and output texts for each style transfer model.
+
+Via adversarial evaluation, the classifiers must distinguish human-generated inputs from machine-generated outputs. 
+The more natural an output is, the likelier it is to fool an adversarial classifier.
+
+We calculate percent agreement with human judgments. On the question of which texts are more natural,
+both classifiers agree more closely with humans who were given relative scoring tasks than with humans
+who were given absolute scoring tasks.
+
+
+'''
+import joblib
+import re
+from keras.models import load_model as load_keras_model
+from keras.preprocessing.sequence import pad_sequences
+from nlgeval.naturalness.tokenizer import *
+
+# NOTE: nlgeval/naturalness/tokenizer.py builds `RE_PATTERN` from the same
+# expression, and the star import above already brings that name in; this
+# assignment rebinds it to an equivalent compiled pattern.
+RE_PATTERN = re.compile(r'|'.join(IGNORED) + r'|(' + r'|'.join(TOKENS) + r')',
+                        re.UNICODE)
+# Directory holding the `neural_<name>.h5` classifier files.
+# NOTE(review): this is a relative path, so it is resolved against the current
+# working directory when a classifier is loaded, not against this file's
+# location - confirm callers always run from a directory where '../models'
+# exists.
+CLASSIFIER_BASE_PATH = '../models/naturalness_classifiers'
+# adjust vocabulary to account for unknowns
+def load_model(path):
+    """Deserialize and return the object stored at `path` using joblib."""
+    loaded = joblib.load(path)
+    return loaded
+def invert_dict(dictionary):
+    """Return a new dict that maps each value of `dictionary` back to its key.
+
+    When several keys share a value, the key that comes last in iteration
+    order wins.
+    """
+    return {value: key for key, value in dictionary.items()}
+
+# Length every index sequence is padded / truncated to before it is handed to
+# the neural classifier (used by `format_inputs`).
+MAX_SEQ_LEN = 30
+# Loaded once, at import time, from a path relative to the working directory.
+TEXT_VECTORIZER = load_model('../models/vectorizer.pkl')
+# token -> index mapping. No copy is made here, so the two assignments below
+# act on whatever object `TEXT_VECTORIZER.vocabulary_` returned.
+VOCABULARY = TEXT_VECTORIZER.vocabulary_
+# index -> token snapshot taken before the adjustments below; it is not
+# refreshed afterwards.
+INVERSE_VOCABULARY = invert_dict(VOCABULARY)
+# Adjust vocabulary to account for unknowns.
+# The token that held index 0 is re-mapped to len(VOCABULARY); presumably this
+# frees 0 for the padding value used in `format_inputs` - TODO confirm.
+VOCABULARY[INVERSE_VOCABULARY[0]] = len(VOCABULARY)
+# Tokens missing from the vocabulary all share this one index, which is one
+# above the index assigned on the previous line.
+VOCABULARY['CUSTOM_UNKNOWN'] = len(VOCABULARY)+1
+
+class NaturalnessClassifier():
+    """Wrapper around one pretrained Keras naturalness classifier.
+
+    :param style_transfer_model_name: name used to pick the classifier file,
+        i.e. `neural_<name>.h5` under `CLASSIFIER_BASE_PATH`.
+    """
+
+    def __init__(self, style_transfer_model_name):
+        # Path is kept on the instance so callers can see which file was loaded.
+        self.path = f'{CLASSIFIER_BASE_PATH}/neural_{style_transfer_model_name}.h5'
+        self.classifier = load_keras_model(self.path)
+
+    def score(self, texts):
+        """Run the classifier on `texts` and return its predictions.
+
+        :param texts: iterable of strings, one entry per text to score.
+        :return: the output of `self.classifier.predict` after `.squeeze()`,
+            i.e. with length-1 axes removed (presumably one score per text
+            - TODO confirm against the model's output shape).
+        """
+        inps = format_inputs(texts)
+        distribution = self.classifier.predict(inps)
+        return distribution.squeeze()
+
+def format_inputs(texts):
+    """Prepare raw texts for use in the neural classifier.
+
+    :param texts: iterable of strings.
+    :return: the result of `pad_sequences`: one row per text, padded with 0
+        or truncated at the end so that each row has `MAX_SEQ_LEN` entries.
+    """
+    # Each text is tokenized and converted exactly once.
+    texts_as_indices = [convert_to_indices(text) for text in texts]
+    return pad_sequences(texts_as_indices, maxlen=MAX_SEQ_LEN, padding='post', truncating='post', value=0.)
+
+def convert_to_indices(text):
+    """Tokenize `text` and map every token to its vocabulary index.
+
+    :param text: string to convert.
+    :return: list of ints, one per non-empty token; tokens missing from
+        `VOCABULARY` get the index stored under 'CUSTOM_UNKNOWN'.
+    """
+    unknown_index = VOCABULARY['CUSTOM_UNKNOWN']
+    # RE_PATTERN is already compiled, so it is used directly. Splitting on it
+    # yields the captured tokens plus None / '' entries for the parts that
+    # were matched without being captured; those falsy entries are dropped.
+    pieces = RE_PATTERN.split(text)
+    return [VOCABULARY.get(piece, unknown_index) for piece in pieces if piece]
diff --git a/nlgeval/naturalness/tokenizer.py b/nlgeval/naturalness/tokenizer.py
@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+'''
+This tokenizer script is used to tokenize inputs fed to the NaturalnessClassifier
+Adapted from https://github.com/bfelbo/DeepMoji/blob/master/deepmoji/tokenizer.py
+
+Splits up a Unicode string into a list of tokens.
+Recognises:
+- Abbreviations
+- URLs
+- Emails
+- #hashtags
+- @mentions
+- emojis
+- emoticons (limited support)
+
+Multiple consecutive symbols are also treated as a single token.
+
+'''
+
+import re
+
+# Basic patterns.
+# Runs of ASCII digits / ASCII letters / whitespace, and any single character.
+RE_NUM = r'[0-9]+'
+RE_WORD = r'[a-zA-Z]+'
+RE_WHITESPACE = r'\s+'
+RE_ANY = r'.'
+
+# Combined words such as 'red-haired' or 'CUSTOM_TOKEN': exactly two letter
+# runs joined by a single '-' or '_'.
+RE_COMB = r'[a-zA-Z]+[-_][a-zA-Z]+'
+
+# English-specific patterns
+# A word, an apostrophe, and another word (e.g. "don't").
+RE_CONTRACTIONS = RE_WORD + r'\'' + RE_WORD
+
+# Honorifics that must stay one token together with their trailing dot.
+TITLES = [
+    r'Mr\.',
+    r'Ms\.',
+    r'Mrs\.',
+    r'Dr\.',
+    r'Prof\.',
+]
+# Ensure case insensitivity for the titles only. A scoped group `(?i:...)` is
+# used because a global `(?i)` flag is only accepted at the very start of a
+# pattern (an error since Python 3.11), while RE_TITLES is joined into the
+# middle of the final token pattern.
+RE_TITLES = r'|'.join([r'(?i:' + t + r')' for t in TITLES])
+
+# Symbols have to be created as separate patterns in order to match consecutive
+# identical symbols.
+SYMBOLS = r'()<!?.,/\'\"-_=\\§|´ˇ°[]<>{}~$^&*;:%+\xa3€`'
+# One alternative per character of SYMBOLS: the escaped character repeated one
+# or more times.
+RE_SYMBOL = r'|'.join([re.escape(s) + r'+' for s in SYMBOLS])
+
+# Hash symbols and at symbols have to be defined separately in order to not
+# clash with hashtags and mentions if there are multiple - i.e.
+# ##hello -> ['#', '#hello'] instead of ['##', 'hello']
+# The string starts with '|' so it can be appended to RE_SYMBOL directly.
+SPECIAL_SYMBOLS = r'|#+(?=#[a-zA-Z0-9_]+)|@+(?=@[a-zA-Z0-9_]+)|#+|@+'
+RE_SYMBOL += SPECIAL_SYMBOLS
+
+# Two or more "letter + dot" pairs in a row (e.g. 'U.S.'), starting at a word
+# boundary that is not directly preceded by a dot.
+RE_ABBREVIATIONS = r'\b(?<!\.)(?:[A-Za-z]\.){2,}'
+
+# Twitter-specific patterns
+# '#' or '@' followed by one or more ASCII letters, digits or underscores.
+RE_HASHTAG = r'#[a-zA-Z0-9_]+'
+RE_MENTION = r'@[a-zA-Z0-9_]+'
+
+# Starts with 'http://', 'https://' or 'www.'.
+# NOTE: inside the character class, `$-_` is a range from '$' to '_', not
+# three separate characters.
+RE_URL = r'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
+RE_EMAIL = r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b'
+
+# Emoticons and emojis
+# One or more repetitions of: '<' (one or more), optional '/', '3' (one or
+# more) - e.g. '<3' or '</3'.
+RE_HEART = r'(?:<+/?3+)+'
+# Building blocks for generated emoticons: a start, an optional middle and a
+# repeatable end character (combined in the loop below).
+EMOTICONS_START = [
+    r'>:',
+    r':',
+    r'=',
+    r';',
+]
+EMOTICONS_MID = [
+    r'-',
+    r',',
+    r'^',
+    u'\'',
+    u'\"',
+]
+EMOTICONS_END = [
+    r'D',
+    r'd',
+    r'p',
+    r'P',
+    r'v',
+    r')',
+    r'o',
+    r'O',
+    r'(',
+    r'3',
+    r'/',
+    r'|',
+    u'\\',
+]
+# Emoticons that do not fit the start/mid/end scheme; matched literally.
+EMOTICONS_EXTRA = [
+    r'-_-',
+    r'x_x',
+    r'^_^',
+    r'o.o',
+    r'o_o',
+    r'(:',
+    r'):',
+    r');',
+    r'(;',
+]
+
+# Start with the literal extras, then append one alternative for every
+# start x mid x end combination: start, optional mid, end repeated 1+ times.
+RE_EMOTICON = r'|'.join([re.escape(s) for s in EMOTICONS_EXTRA])
+for s in EMOTICONS_START:
+    for m in EMOTICONS_MID:
+        for e in EMOTICONS_END:
+            RE_EMOTICON += '|{0}{1}?{2}+'.format(re.escape(s), re.escape(m), re.escape(e))
+
+# requires ucs4 in python2.7 or python3+
+# RE_EMOJI = r"""[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]"""
+# safe for all python
+# The first two alternatives are written as a high surrogate followed by a low
+# surrogate range; the third is a single character in U+2600-U+27BF.
+RE_EMOJI = r"""\ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f\ude80-\udeff]|[\u2600-\u26FF\u2700-\u27BF]"""
+
+# List of matched token patterns, ordered from most specific to least specific.
+# The order matters: the entries are joined with '|' below, and regex
+# alternation tries them left to right at each position.
+TOKENS = [
+    RE_URL,
+    RE_EMAIL,
+    RE_COMB,
+    RE_HASHTAG,
+    RE_MENTION,
+    RE_HEART,
+    RE_EMOTICON,
+    RE_CONTRACTIONS,
+    RE_TITLES,
+    RE_ABBREVIATIONS,
+    RE_NUM,
+    RE_WORD,
+    RE_SYMBOL,
+    RE_EMOJI,
+    RE_ANY
+]
+
+# List of ignored token patterns
+IGNORED = [
+    RE_WHITESPACE
+]
+
+# Final pattern
+# The ignored alternatives sit outside the single capture group, so they are
+# matched (consumed) without being captured; every token alternative sits
+# inside the group.
+RE_PATTERN = re.compile(r'|'.join(IGNORED) + r'|(' + r'|'.join(TOKENS) + r')',
+                        re.UNICODE)
+
+def tokenize(text):
+    """Split `text` into a list of tokens.
+
+    Matches whose captured text is empty or whitespace-only are dropped.
+    """
+    tokens = []
+    for candidate in RE_PATTERN.findall(text):
+        if candidate and candidate.strip():
+            tokens.append(candidate)
+    return tokens