'''
Author: Carl Yang
Function: A separate program that generates pairs based on
random walk paths on a place network
Command: buck run @mode/dev-nosan //experimental/carlyang/place_embedding:pair_gen
'''
from __future__ import absolute_import, division, print_function, unicode_literals
from collections import defaultdict
import random
import pandas as pd
from analytics.bamboo import Bamboo
from datetime import datetime

# Tuning constants for the random-walk pair generator.

# Passed as batch_size when reading the place table, and multiplied by n_path
# to get the total number of walks. Presumably the row count of the table --
# TODO confirm.
n_place = 3839755
# NOTE(review): not referenced anywhere in the visible code.
k = 10
# Per-step probability of jumping to a random place that shares the current
# place's category (only taken when the current place has a category).
ratio_category = 0.2
# On a location step, probability of trying to hop to a diagonally adjacent
# grid cell instead of staying in the current cell.
ratio_closeby = 0.2
# The script runs n_path * n_place walks in total, each from a randomly
# chosen start place.
n_path = 5
# Number of steps per walk; each walk therefore visits length + 1 places.
length = 5
# Number of (p1, p2) draws attempted per walk; draws with p1 == p2 are dropped.
n_pair = 10
# Number of negative (label=False) pairs emitted for every positive pair.
negative = 2

# Load the place table (page id, latitude, longitude, category) from Hive.
# The result is used below like a pandas DataFrame (iterrows(), df.page_id).
# batch_size is set to n_place, presumably so the whole table arrives in a
# single batch -- TODO confirm against the Bamboo API.
df = Bamboo().read_hive_table(
    namespace='entities',
    table='daiquery_246594546122201',
    column_names=['page_id', 'latitude', 'longitude', 'category'],
    batch_size=n_place
)
print('{}: Finished reading data.'.format(datetime.now()))

# Lookup tables built from the place table:
#   category            page_id   -> category
#   category_reverse    category  -> [page_id, ...]
#   coordinate          page_id   -> grid-cell key "<lat*1000>,<lon*1000>"
#   coordinate_reverse  cell key  -> [page_id, ...]
category, coordinate = {}, {}
category_reverse, coordinate_reverse = defaultdict(list), defaultdict(list)

for row_idx, place in df.iterrows():
    # Progress log every 10000 rows.
    if row_idx % 10000 == 0:
        print('{}: Processing place {}.'.format(datetime.now(), row_idx))

    place_id = place.page_id

    # Places whose category is None are left out of both category maps.
    # NOTE(review): a NaN category is not None and would still be indexed --
    # confirm how missing categories are represented in the table.
    place_category = place.category
    if place_category is not None:
        category[place_id] = place_category
        category_reverse[place_category].append(place_id)

    # Snap the position onto an integer grid (coordinate * 1000, rounded) and
    # use "lat,lon" of that grid cell as the key shared by co-located places.
    lat_cell = int(round(place.latitude * 1000))
    lon_cell = int(round(place.longitude * 1000))
    cell_key = '{},{}'.format(lat_cell, lon_cell)
    coordinate[place_id] = cell_key
    coordinate_reverse[cell_key].append(place_id)

# Output columns, accumulated row by row; converted to a DataFrame afterwards.
pair = {
    'page1_id': [],
    'page2_id': [],
    'label': [],
    'set_name': [],
}

# Materialise both sampling pools once, as plain lists, before the hot loop.
# - random.choice() needs an indexable sequence: a dict keys view is not
#   subscriptable on Python 3, and on Python 2 calling .keys() per draw copies
#   the whole key list every time.
# - Indexing a pandas Series goes through label lookup, which is slow and only
#   matches positional choice when the index is the default 0..n-1 range.
start_pool = df.page_id.tolist()
# Negatives are drawn only from places that have a category (as before).
negative_pool = list(category)

print('{}: Generating random walks and pairs.'.format(datetime.now()))
for _ in range(n_path * n_place):
    # One walk: a uniformly chosen start place followed by `length` steps.
    start = random.choice(start_pool)
    seq = [start]
    cur = start
    for _ in range(length):
        if random.random() < ratio_category and cur in category:
            # Category step: jump to a random place with the same category
            # (possibly the current place itself).
            cat = category[cur]
            cur = random.choice(category_reverse[cat])
        else:
            # Location step: pick a random place in the current grid cell, or,
            # with probability ratio_closeby, in a diagonally adjacent cell
            # when that cell contains at least one place.
            coo = coordinate[cur]
            if random.random() < ratio_closeby:
                x = random.choice([-1, 1])
                y = random.choice([-1, 1])
                lat_cell, lon_cell = coo.split(',')
                coo_closeby = '{},{}'.format(
                    int(lat_cell) + x, int(lon_cell) + y)
                # Membership test (not indexing) so that the defaultdict does
                # not grow an empty entry for unpopulated cells.
                if coo_closeby in coordinate_reverse:
                    coo = coo_closeby
            cur = random.choice(coordinate_reverse[coo])
        seq.append(cur)

    # Turn the walk into training pairs: places co-occurring on a walk are
    # positives; each positive is accompanied by `negative` random negatives.
    for _ in range(n_pair):
        p1 = random.choice(seq)
        p2 = random.choice(seq)
        if p1 != p2:
            pair['page1_id'].append(p1)
            pair['page2_id'].append(p2)
            pair['label'].append(True)
            pair['set_name'].append('sample')
            for _ in range(negative):
                # NOTE(review): a negative may coincide with p1 or with a
                # place on the walk; no filtering is applied.
                p3 = random.choice(negative_pool)
                pair['page1_id'].append(p1)
                pair['page2_id'].append(p3)
                pair['label'].append(False)
                pair['set_name'].append('sample')

# Convert the accumulated columns into a DataFrame and upload it to the Hive
# table entities.tmp_carl_random_walk_pairs.
# NOTE(review): retention=1 is presumably a retention period in days -- confirm
# against the Bamboo API. `res` is not used afterwards in the visible code.
print('{}: Writing hive table.'.format(datetime.now()))
output = pd.DataFrame(data=pair)
res = Bamboo().upload_data_to_hive(
    namespace='entities',
    table='tmp_carl_random_walk_pairs',
    df=output,
    retention=1
)
