'''
This script creates a .csv file containing three columns: responseid,
bias_word, and gullibility_word. bias_word and gullibility_word take
value 1 if the openendedmotives response contains a bias stem or a
gullibility stem, respectively, and 0 otherwise.
'''

from nltk.stem import SnowballStemmer
import pandas as pd
import pdb
import json

# Shared English Snowball stemmer; _custom_stem falls back to it for every
# word that does not contain "racis".
stemmer = SnowballStemmer(language='english')

# Input Stata file read by main(). Currently points at the pilot data;
# replace with deidentified main exp data.
data_path = 'data/raw/receiver-pilot-deidentified.dta'

# Output .csv written by main(). Currently points at the pilot output;
# replace with main exp output path.
output_path = 'data/working/receiver-pilot-motives.csv'

def _custom_stem(word):
	'''
	NLTK's Snowball Stemmer leaves "racist" and "racism" unchanged. We 
	modify the stemmer to stem both terms to "racis". 
	'''

	if 'racis' in word:
		return 'racis'
	else:
		return stemmer.stem(word)

def _get_synonyms():
	'''
	Load the synonym lists from data/raw/synonyms.json and stem them.

	Synonyms drawn from www.thesaurus.com on 2019-02-17. All "most relevant" synonyms
	(and only "most relevant" synonyms), as marked on the webpage, are included. We
	stem all synonyms so that we capture different parts of speech.

	Returns a dict with the keys 'bias' and 'gullibility'. Each value maps
	a stemmed base word to the list of its stemmed synonyms. When several
	base words share the same stem, their synonym lists are concatenated
	under that stem.
	'''

	with open('data/raw/synonyms.json', 'r', encoding='utf-8') as f:
		synonyms = json.load(f)

	stemmed_synonyms = {}
	for dimension in ['bias', 'gullibility']:
		stemmed = {}
		for base_word, synonym_list in synonyms[dimension].items():
			# _custom_stem already maps "racist" to "racis", so no extra
			# special-casing is needed here.
			# setdefault + extend merges base words that collapse to the
			# same stem instead of letting the later one overwrite the
			# earlier one's synonyms.
			stemmed.setdefault(_custom_stem(base_word), []).extend(
				_custom_stem(synonym) for synonym in synonym_list
			)
		stemmed_synonyms[dimension] = stemmed
	return stemmed_synonyms

def _all_stems(stemmed_synonyms):
	'''
	Return the set of every stem in one dimension: the stemmed base words
	(the dict keys) together with all of their stemmed synonyms (the dict
	values).
	'''
	stems = set(stemmed_synonyms)
	for synonym_stems in stemmed_synonyms.values():
		stems.update(synonym_stems)
	return stems


def _contains_any_stem(responses, stems):
	'''
	Return an integer Series that is 1 where the response text contains
	any of `stems` (case-insensitive substring match) and 0 otherwise.
	Missing responses are coded as 0.
	'''
	import re

	# Escape each stem so that it is matched literally even if it contains
	# a regex metacharacter; empty stems are dropped because an empty
	# alternative matches every string. Sorting only makes the pattern
	# reproducible from run to run.
	pattern = '|'.join(sorted(re.escape(stem) for stem in stems if stem))
	if not pattern:
		# With no stems at all the pattern would be '' and flag every
		# response; nothing can match, so return all zeros instead.
		return pd.Series(0, index=responses.index, dtype=int)

	# na=False turns missing responses into False rather than NaN, which
	# could not be converted to int.
	matches = responses.str.contains(pattern, case=False, regex=True, na=False)
	return matches.astype(int)


def main():
	'''
	Flag each open-ended response for bias and gullibility stems.

	Reads the Stata file at data_path, checks the openendedmotives column
	against the stemmed synonym lists from _get_synonyms, and writes the
	columns responseid, bias_word and gullibility_word to output_path as
	a .csv file without the index.
	'''
	synonyms = _get_synonyms()
	data = pd.read_stata(data_path)
	responses = data['openendedmotives']

	# Build the output as a new frame rather than assigning into a slice
	# of `data`, which triggers pandas' chained-assignment warning.
	output = pd.DataFrame({'responseid': data['responseid']})
	for dimension in ['bias', 'gullibility']:
		stems = _all_stems(synonyms[dimension])
		output[dimension + '_word'] = _contains_any_stem(responses, stems)

	output.to_csv(output_path, index=False)

# Run the pipeline only when this file is executed as a script, so that
# importing the module does not read the data or write the output file.
if __name__ == '__main__':
	main()