Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added models/naturalness_classifiers/neural_ARAE.h5
Binary file not shown.
Binary file added models/naturalness_classifiers/neural_CAAE.h5
Binary file not shown.
Binary file added models/naturalness_classifiers/neural_DAR.h5
Binary file not shown.
Binary file added models/vectorizer.pkl
Binary file not shown.
52 changes: 45 additions & 7 deletions nlgeval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,25 @@
from nlgeval.pycocoevalcap.cider.cider import Cider
from nlgeval.pycocoevalcap.meteor.meteor import Meteor
from nlgeval.pycocoevalcap.rouge.rouge import Rouge

from nlgeval.naturalness.naturalness import NaturalnessClassifier

# str/unicode stripping in Python 2 and 3 instead of `str.strip`.
def _strip(s):
return s.strip()

def get_sents(paths):
    """Collect the cleaned lines of several text files into one flat list.

    :param paths: iterable of file paths; each file is opened in text mode
        and read line by line.
    :return: list with one entry per line, in file order, each passed
        through ``clean_line``.
    """
    cleaned = []
    for file_path in paths:
        with open(file_path, 'r') as handle:
            # Consume the file while it is still open.
            cleaned.extend(clean_line(raw) for raw in handle)
    return cleaned

# Single-pass translation table used by ``clean_line``: '-' and '/' become a
# space, the remaining listed punctuation characters are deleted.
_CLEAN_LINE_TABLE = str.maketrans('-/', '  ', ',.\'!)(;%?"')


def clean_line(line):
    """Lower-case ``line`` and strip/replace a fixed set of punctuation.

    The characters ``, . ' ! ) ( ; % ? "`` are removed, while ``-`` and ``/``
    are each replaced by a single space. Everything else (including any
    trailing newline) is left untouched.

    :param line: text to normalise.
    :return: the normalised text.
    """
    lowered = line.lower()
    return lowered.translate(_CLEAN_LINE_TABLE)

def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False, not_naturalness=False, naturalness_model='CAAE'):
with open(hypothesis, 'r') as f:
hyp_list = f.readlines()
ref_list = []
Expand Down Expand Up @@ -80,10 +91,17 @@ def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=Fa
value = float(value.strip())
ret_scores[name] = value

if not not_naturalness:
neural_classifier = NaturalnessClassifier(naturalness_model)
hyp = get_sents(references)
scores = neural_classifier.score(hyp).mean()
ret_scores['Naturalness'] = scores
print("Mean Naturalness of references: {}".format(ret_scores['Naturalness']))

return ret_scores


def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False):
def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False, not_naturalness=False, naturalness_model='CAAE'):
assert isinstance(hyp, six.string_types)

if isinstance(ref, six.string_types):
Expand Down Expand Up @@ -141,6 +159,11 @@ def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False
name, value = score.split(':')
value = float(value.strip())
ret_scores[name] = value

if not not_naturalness:
neural_classifier = NaturalnessClassifier(naturalness_model)
scores = neural_classifier.score(ref).mean()
ret_scores['Naturalness'] = scores

return ret_scores

Expand All @@ -163,8 +186,7 @@ class NLGEval(object):
'SkipThoughtCS',
} | glove_metrics

def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False,
metrics_to_omit=None):
def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False, no_naturalness=False, naturalness_model='CAAE', metrics_to_omit=None):
"""
:param no_overlap: Default: Use overlap metrics.
`True` if these metrics should not be used.
Expand Down Expand Up @@ -194,6 +216,9 @@ def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False,
"Invalid metrics to omit: {}".format(self.metrics_to_omit - self.valid_metrics)

self.no_overlap = no_overlap
self.no_naturalness = no_naturalness
self.naturalness_model = naturalness_model

if not no_overlap:
self.load_scorers()

Expand Down Expand Up @@ -282,6 +307,11 @@ def compute_individual_metrics(self, ref, hyp):
name, value = score.split(':')
value = float(value.strip())
ret_scores[name] = value

if not self.no_naturalness:
neural_classifier = NaturalnessClassifier(self.naturalness_model)
scores = neural_classifier.score(ref).mean()
ret_scores['Naturalness'] = scores

return ret_scores

Expand Down Expand Up @@ -319,5 +349,13 @@ def compute_metrics(self, ref_list, hyp_list):
name, value = score.split(':')
value = float(value.strip())
ret_scores[name] = value

return ret_scores

if not self.no_naturalness:
neural_classifier = NaturalnessClassifier(self.naturalness_model)
all_refs = []
for ref in ref_list:
all_refs.extend(ref)
scores = neural_classifier.score(all_refs).mean()
ret_scores['Naturalness'] = scores

return ret_scores
Empty file added nlgeval/naturalness/__init__.py
Empty file.
85 changes: 85 additions & 0 deletions nlgeval/naturalness/naturalness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@

'''
@article{Mir2019EvaluatingST,
title={Evaluating Style Transfer for Text},
author={Remi Mir and Bjarke Felbo and Nick Obradovich and Iyad Rahwan},
journal={ArXiv},
year={2019},
volume={abs/1904.02295}
}

Implementation and Pretrained Classifier models from the repo: https://github.com/passeul/style-transfer-model-evaluation

This code can be used to evaluate the naturalness of output sentiment texts of examined style transfer models.

For a baseline understanding of what is considered "natural," any method used for automated evaluation of naturalness
also requires an understanding of the human-sourced input texts.

Inspired by the adversarial evaluation approach in "Generating Sentences from a Continuous Space"
(Bowman et al., 2016), we trained unigram logistic regression classifiers and LSTM logistic regression classifiers
on samples of input texts and output texts for each style transfer model.

Via adversarial evaluation, the classifiers must distinguish human-generated inputs from machine-generated outputs.
The more natural an output is, the likelier it is to fool an adversarial classifier.

We calculate percent agreement with human judgments. Both classifiers show greater agreement on which texts are
considered more natural with humans given relative scoring tasks than with those given absolute scoring tasks.


'''
import joblib
import re
from keras.models import load_model as load_keras_model
from keras.preprocessing.sequence import pad_sequences
from nlgeval.naturalness.tokenizer import *

# Tokenisation pattern assembled from the IGNORED and TOKENS lists that the
# star import of nlgeval.naturalness.tokenizer brings into this module. Only
# the TOKENS alternatives sit inside the capturing group; the IGNORED
# alternatives are matched outside it.
# NOTE(review): the tokenizer module builds a RE_PATTERN from the same
# expression, so this assignment rebinds the star-imported name — confirm
# whether the duplicate is intentional.
RE_PATTERN = re.compile(r'|'.join(IGNORED) + r'|(' + r'|'.join(TOKENS) + r')',
re.UNICODE)
# Directory holding the neural_<name>.h5 classifier files.
# NOTE(review): this is a relative path, so it is resolved against the
# process's current working directory rather than this file's location —
# confirm that callers always run from a suitable directory.
CLASSIFIER_BASE_PATH = '../models/naturalness_classifiers'
# adjust vocabulary to account for unknowns
def load_model(path):
    """Load and return the object that joblib serialised at ``path``.

    NOTE: joblib files are pickle-based, so only load files from a trusted
    source.
    """
    loaded = joblib.load(path)
    return loaded
def invert_dict(dictionary):
    """Return a new dict mapping each value of ``dictionary`` to its key.

    If several keys share a value, the key that comes last in iteration
    order wins.
    """
    return {value: key for key, value in dictionary.items()}

# Every tokenised text is padded or truncated to this many indices in
# format_inputs before it is handed to the neural classifier.
MAX_SEQ_LEN = 30 # for neural classif
# Loaded once at import time.
# NOTE(review): relative path, resolved against the current working
# directory — confirm callers run from a suitable directory.
TEXT_VECTORIZER = load_model('../models/vectorizer.pkl')
# presumably a token -> integer index mapping with indices starting at 0
# (as on a fitted scikit-learn vectorizer) — TODO confirm
VOCABULARY = TEXT_VECTORIZER.vocabulary_
INVERSE_VOCABULARY = invert_dict(VOCABULARY)
# Re-home the token that held index 0 to index len(VOCABULARY); presumably
# this frees 0 for use as the padding value (format_inputs pads with 0).
VOCABULARY[INVERSE_VOCABULARY[0]] = len(VOCABULARY)
# Index used by convert_to_indices for tokens missing from the vocabulary.
# The previous line reassigned an existing key, so len(VOCABULARY) is
# unchanged and this index is one past the re-homed token's index.
VOCABULARY['CUSTOM_UNKNOWN'] = len(VOCABULARY)+1

class NaturalnessClassifier():
    """Scores texts with one of the pretrained Keras naturalness classifiers.

    The classifier file is looked up as
    ``<CLASSIFIER_BASE_PATH>/neural_<style_transfer_model_name>.h5``.
    """

    def __init__(self, style_transfer_model_name):
        """Load the classifier for the given style-transfer model name.

        :param style_transfer_model_name: suffix of the ``neural_*.h5`` file
            to load (the callers in this package default to ``'CAAE'``).
        """
        self.path = f'{CLASSIFIER_BASE_PATH}/neural_{style_transfer_model_name}.h5'
        self.classifier = load_keras_model(self.path)

    def score(self, texts):
        """Return the classifier's prediction for each text.

        :param texts: list of strings to score. A single string is also
            accepted and treated as a one-element list; without this it would
            be iterated character by character.
        :return: the output of ``self.classifier.predict`` with ``squeeze()``
            applied (presumably a NumPy array with length-1 axes removed, so
            a single text yields a 0-d array — confirm against Keras).
        """
        if isinstance(texts, str):
            texts = [texts]
        # The earlier debugging print of ``texts`` was removed: it wrote the
        # entire input corpus to stdout on every call.
        inps = format_inputs(texts)
        distribution = self.classifier.predict(inps)
        return distribution.squeeze()

def format_inputs(texts):
    """Prepare texts for use in the neural classifier.

    Each text is converted to a list of vocabulary indices, then all lists
    are padded or truncated at the end to ``MAX_SEQ_LEN`` entries, using 0
    as the padding value.

    :param texts: iterable of strings.
    :return: the result of Keras ``pad_sequences`` on the index lists.
    """
    # Convert each text exactly once; the previous version called
    # convert_to_indices twice per text and threw the first result away.
    texts_as_indices = [convert_to_indices(text) for text in texts]
    return pad_sequences(texts_as_indices, maxlen=MAX_SEQ_LEN, padding='post', truncating='post', value=0.)

def convert_to_indices(text):
    """Tokenise ``text`` and map every token to its vocabulary index.

    :param text: string to tokenise with the module-level ``RE_PATTERN``.
    :return: list of integer indices, one per non-empty token; tokens that
        are not in ``VOCABULARY`` get the ``'CUSTOM_UNKNOWN'`` index.
    """
    # RE_PATTERN is already a compiled pattern, so it is used directly.
    # Splitting on it returns the captured tokens interleaved with
    # None/empty pieces, which the ``if piece`` filter discards.
    pieces = RE_PATTERN.split(text)
    return [
        VOCABULARY[piece] if piece in VOCABULARY else VOCABULARY['CUSTOM_UNKNOWN']
        for piece in pieces
        if piece
    ]
147 changes: 147 additions & 0 deletions nlgeval/naturalness/tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# -*- coding: utf-8 -*-
'''
This tokenizer script is used to tokenize inputs fed to the NaturalnessClassifier
Adapted from https://github.com/bfelbo/DeepMoji/blob/master/deepmoji/tokenizer.py

Splits up a Unicode string into a list of tokens.
Recognises:
- Abbreviations
- URLs
- Emails
- #hashtags
- @mentions
- emojis
- emoticons (limited support)

Multiple consecutive symbols are also treated as a single token.

'''

import re

# Basic patterns.
RE_NUM = r'[0-9]+'
RE_WORD = r'[a-zA-Z]+'
RE_WHITESPACE = r'\s+'
RE_ANY = r'.'

# Combined words such as 'red-haired' or 'CUSTOM_TOKEN'
RE_COMB = r'[a-zA-Z]+[-_][a-zA-Z]+'

# English-specific patterns
RE_CONTRACTIONS = RE_WORD + r'\'' + RE_WORD

TITLES = [
    r'Mr\.',
    r'Ms\.',
    r'Mrs\.',
    r'Dr\.',
    r'Prof\.',
]
# Ensure case insensitivity. A scoped group ``(?i:...)`` is used instead of
# prefixing each title with a bare ``(?i)``: once the titles are joined into
# the final pattern a bare flag is no longer at the start of the expression,
# which Python 3.6+ warns about and Python 3.11+ rejects with re.error.
RE_TITLES = r'(?i:' + r'|'.join(TITLES) + r')'

# Symbols have to be created as separate patterns in order to match consecutive
# identical symbols.
SYMBOLS = r'()<!?.,/\'\"-_=\\§|´ˇ°[]<>{}~$^&*;:%+\xa3€`'
RE_SYMBOL = r'|'.join([re.escape(s) + r'+' for s in SYMBOLS])

# Hash symbols and at symbols have to be defined separately in order to not
# clash with hashtags and mentions if there are multiple - i.e.
# ##hello -> ['#', '#hello'] instead of ['##', 'hello']
SPECIAL_SYMBOLS = r'|#+(?=#[a-zA-Z0-9_]+)|@+(?=@[a-zA-Z0-9_]+)|#+|@+'
RE_SYMBOL += SPECIAL_SYMBOLS

RE_ABBREVIATIONS = r'\b(?<!\.)(?:[A-Za-z]\.){2,}'

# Twitter-specific patterns
RE_HASHTAG = r'#[a-zA-Z0-9_]+'
RE_MENTION = r'@[a-zA-Z0-9_]+'

RE_URL = r'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
RE_EMAIL = r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b'

# Emoticons and emojis
RE_HEART = r'(?:<+/?3+)+'
EMOTICONS_START = [
    r'>:',
    r':',
    r'=',
    r';',
]
EMOTICONS_MID = [
    r'-',
    r',',
    r'^',
    u'\'',
    u'\"',
]
EMOTICONS_END = [
    r'D',
    r'd',
    r'p',
    r'P',
    r'v',
    r')',
    r'o',
    r'O',
    r'(',
    r'3',
    r'/',
    r'|',
    u'\\',
]
EMOTICONS_EXTRA = [
    r'-_-',
    r'x_x',
    r'^_^',
    r'o.o',
    r'o_o',
    r'(:',
    r'):',
    r');',
    r'(;',
]

# Start with the fixed emoticons, then add every start/mid/end combination
# with an optional middle character and a repeatable end character.
RE_EMOTICON = r'|'.join([re.escape(s) for s in EMOTICONS_EXTRA])
for s in EMOTICONS_START:
    for m in EMOTICONS_MID:
        for e in EMOTICONS_END:
            RE_EMOTICON += '|{0}{1}?{2}+'.format(re.escape(s), re.escape(m), re.escape(e))

# requires ucs4 in python2.7 or python3+
# RE_EMOJI = r"""[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]"""
# safe for all python
RE_EMOJI = r"""\ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f\ude80-\udeff]|[\u2600-\u26FF\u2700-\u27BF]"""

# Token patterns, ordered from most specific to least specific.
_TOKEN_PATTERNS = [
    RE_URL,
    RE_EMAIL,
    RE_COMB,
    RE_HASHTAG,
    RE_MENTION,
    RE_HEART,
    RE_EMOTICON,
    RE_CONTRACTIONS,
    RE_TITLES,
    RE_ABBREVIATIONS,
    RE_NUM,
    RE_WORD,
    RE_SYMBOL,
    RE_EMOJI,
    RE_ANY
]

# List of matched token patterns, ordered from most specific to least specific.
# Each entry is wrapped in a scoped, non-capturing ``(?i:...)`` group. The
# bare ``(?i)`` that used to sit on the titles acted as a global flag and made
# the whole combined pattern case-insensitive; wrapping every entry keeps
# that matching behaviour without a global flag in the middle of the pattern.
TOKENS = [r'(?i:' + pattern + r')' for pattern in _TOKEN_PATTERNS]

# List of ignored token patterns
IGNORED = [
    RE_WHITESPACE
]

# Final pattern: ignored text is matched outside the capturing group, tokens
# inside it.
RE_PATTERN = re.compile(r'|'.join(IGNORED) + r'|(' + r'|'.join(TOKENS) + r')',
                        re.UNICODE)

def tokenize(text):
    """Split ``text`` into a list of tokens using ``RE_PATTERN``.

    Matches that are empty or consist only of whitespace are dropped.
    """
    tokens = []
    for candidate in RE_PATTERN.findall(text):
        if not candidate:
            continue
        if candidate.strip():
            tokens.append(candidate)
    return tokens
Loading