
# Choosing the right step size is more of an art than a science.
#
# Popular options include:
#
#   (1) Using a fixed step size
#   (2) Gradually shrinking the step size over time
#       (see the small sketch right after the step_sizes list below)
#   (3) At each step, choosing the step size that minimizes
#       the value of the objective function (very computationally
#       expensive; minimize_batch below approximates it by trying
#       a fixed menu of step sizes)

step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
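
# A minimal sketch of option (2) above, gradually shrinking the step
# size over time. The halving schedule and the helper name
# shrinking_step_sizes are just illustrative; they are not used
# elsewhere in this file:

def shrinking_step_sizes(initial_step=1.0, shrink=0.5, n=8):
    """yield n step sizes, each `shrink` times the previous one"""
    size = initial_step
    for _ in range(n):
        yield size
        size *= shrink

# e.g. list(shrinking_step_sizes()) == [1.0, 0.5, 0.25, ..., 0.0078125]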

# It is possible that certain step sizes will result in invalid inputs 
# for our function. 
# So we’ll need to create a “safe apply” function that returns infinity 
# (which should never be the minimum of anything) for invalid inputs:

# safe(f):
#
#    Input: function f
#    Output: safe function that calls f

def safe(f):
    """return a new function that's the same as f,
       except that it outputs infinity whenever f produces an error
    """

    def safe_f(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except:
            return float('inf') # this means "infinity" in Python

    return safe_f
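
# A quick check of safe() (safe_sum is just a throwaway name for this
# demo; the built-in sum raises a TypeError when it hits a non-numeric
# element, which safe() turns into infinity):

safe_sum = safe(sum)
print(safe_sum([1, 2, 3]))        # 6
print(safe_sum([1, "two", 3]))    # inf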

# NOTE:
#
# You typically call safe(f) like this:
#
#     f = some_function           # f refers to some function object
#     f = safe(f)                 # safe(f) builds and returns a NEW
#                                 # function (safe_f) that remembers the
#                                 # ORIGINAL function object it was given
#                                 # (a closure), so rebinding the name f
#                                 # afterwards does not change what
#                                 # safe_f calls.
#
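
# A tiny demonstration of that closure behavior (the function name
# reciprocal here is only illustrative):

def reciprocal(x):
    return 1 / x

reciprocal = safe(reciprocal)     # rebind the name to the safe version

print(reciprocal(4))              # 0.25 (the original function still runs)
print(reciprocal(0))              # inf  (the ZeroDivisionError is caught)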

# ###################################################
# step(v, direction, step_size):
#
#      Return a new vector obtained by starting at v and moving
#      "step_size" in the direction "direction"
#
#      v is a vector
#      direction is typically the gradient (or its negative)

def step(v, direction, step_size):
    """move step_size in the direction from v"""
    return [v_i + step_size * direction_i
            for v_i, direction_i in zip(v, direction)]
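
# Quick example: at v = [2, 6] the sum-of-squares gradient is [4, 12],
# and a step of -0.5 in that direction lands exactly at the minimum:

print(step([2, 6], [4, 12], -0.5))   # [0.0, 0.0]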

# ############################################################
# Gradient descent with the "best step size at each step" method
# (option 3 above, approximated by trying a fixed menu of step sizes)

def minimize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001):
    """
    use gradient descent to find theta that minimizes target function
    """

    step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

    theta = theta_0 			# set theta to initial value
    target_fn = safe(target_fn) 	# safe version of target_fn

    value = target_fn(theta) 		# Function value at theta

    while True:
        gradient = gradient_fn(theta)

        # Compute all thetas with given step size
        next_thetas = [step(theta, gradient, -step_size)
                       for step_size in step_sizes]

        # choose the one that minimizes the error function
        next_theta = min(next_thetas, key=target_fn)
#                                     ^^^^^^^^^^^^^
#                                     VERY sneaky way to pick the theta
#                                     whose function value is smallest !!!
#                                     See: 00-min.py and the small
#                                     min(..., key=...) demo after
#                                     this function
        next_value = target_fn(next_theta)

        # stop if we're "converging"
        if abs(value - next_value) < tolerance:
            return theta
        else:
            theta, value = next_theta, next_value
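
# A small demo of min(..., key=...): it returns the ELEMENT whose key
# value is smallest, not the key value itself (the word list here is
# made up):

print(min(["gradient", "descent", "step"], key=len))   # prints "step"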

# ###################################################################
# How to use the "safe" gradient descent with optimal step size:

def sum_of_squares(v):
    """computes the sum of squared elements in v"""
    return sum(v_i ** 2 for v_i in v)

def sum_of_squares_gradient(v):
    """computes the gradient of sum_of_squares: d(v_i ** 2) / d v_i = 2 * v_i"""
    return [2 * v_i for v_i in v]


v = [2, 6, 9, 8]
min_v = minimize_batch(sum_of_squares, sum_of_squares_gradient, v)

print(v, min_v)
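
# Rough sanity check: sum_of_squares is minimized at the origin, so
# every component of min_v should come out very close to 0 (the 0.01
# cutoff here is just a loose, illustrative bound):

assert all(abs(x) < 0.01 for x in min_v)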

