In [18]:
from mxnet import autograd, np, npx

# Enable MXNet's NumPy-compatible array semantics for the `np` module imported above
npx.set_np()

In [19]:
# Input vector used throughout the running example
x = np.arange(4.0)
x

# Goal: compute the derivative (gradient) of y = 2 * x^T x with respect to x

array([0., 1., 2., 3.])

In [20]:
# Inner product x^T x = 0^2 + 1^2 + 2^2 + 3^2 = 14  (note: this is y / 2, not y)
np.dot(x,x)
# y = 2 * x^T x = 2(x0^2 + x1^2 + x2^2 + x3^2)      (y is a SCALAR)
# Partial derivatives of y with respect to each component of x:
# dy/dx0 = 4 x0
# dy/dx1 = 4 x1
# dy/dx2 = 4 x2
# dy/dx3 = 4 x3

array(14.)

In [21]:
# Allocate memory for the gradient of `x` by invoking `attach_grad`
x.attach_grad()

# After we calculate a gradient taken with respect to `x`, we will be able to
# access it via the `grad` attribute. Until then its values are all 0s
# (see the output below).
x.grad

array([0., 0., 0., 0.])

In [22]:
# Compute y inside an `autograd.record` scope so that the computational
# graph needed for backpropagation is built
with autograd.record():
    y = 2 * np.dot(x, x)  # y = 2 * x^T x, a scalar
y  # 2 * 14 = 28

array(28.)

In [23]:
# We can automatically calculate the gradient of y with respect to
# each component of x by calling the function for backpropagation:
y.backward()

# Then display the gradient
x.grad

# Check by hand:
# y = 2 * (x0^2 + x1^2 + x2^2 + x3^2)
# dy/dx0 = 4 x0
# dy/dx1 = 4 x1
# dy/dx2 = 4 x2
# dy/dx3 = 4 x3
# x = [0,1,2,3]
# So: grad(x) = [0,4,8,12]

array([ 0.,  4.,  8., 12.])

In [24]:
# Example 2: y = x0 + x1 + x2 + x3           (y is a SCALAR)
# dy/dx0 = 1
# dy/dx1 = 1
# dy/dx2 = 1
# dy/dx3 = 1
# x = [0,1,2,3]
# So: grad(x) = [1,1,1,1], independent of the values stored in x

with autograd.record():
    y = x.sum()
    
y.backward()
x.grad  # The previous gradient [0,4,8,12] is overwritten (not accumulated) by the new one

array([1., 1., 1., 1.])

In [25]:
# What does `backward()` do if the function y = f(x) is vector-valued?
# Technically, when y is not a scalar, the most natural interpretation of
# the derivative of a vector y with respect to a vector x is a matrix.
# For higher-order and higher-dimensional y and x, the
# differentiation result could be a high-order tensor.
#
# In practice, when we call backward on a vector-valued function, we are
# usually trying to calculate the derivatives of loss functions.
# Our intent is not to calculate the differentiation matrix but rather:
#       the sum of the partial derivatives
# Therefore: if y = f(x) is a vector, mxnet treats it as:   f(x).sum()
#
# When we invoke `backward` on a vector-valued variable `y` (function of `x`),
# a new scalar variable is created by summing the elements in `y`. Then the
# gradient of that scalar variable with respect to `x` is computed

with autograd.record():
    y = x*x        # y = (x0^2, x1^2, x2^2, x3^2) ==> summed: x0^2 + x1^2 + x2^2 + x3^2

# d(sum(y))/dx = (2*x0, 2*x1, 2*x2, 2*x3)
y.backward()
x.grad           # Same gradient as for the scalar function y = sum(x * x)

array([0., 2., 4., 6.])

In [30]:
# Detaching Computation
# Suppose:  y = f(x)
#           z = g(x,y)
# Normally: dz/dx will use the chain rule (y contributes through dy/dx)...
# Suppose we wanted to calculate the gradient of z with respect
# to x, but wanted for some reason to treat y as a constant.
# A "detached" copy of y can be used to compute that derivative:

with autograd.record():
    y = x * x         # x=[0,1,2,3], y = x*x = [0,1,4,9]
    u = y.detach()    # u holds the values of y but is treated as a constant
    z = u * x         # dz/dx = u (and not 3*x^2) !!!
z.backward()
x.grad

array([0., 1., 4., 9.])

In [31]:
# Detaching y into u does not discard the recorded computation of y itself.
# Therefore, we can still run backpropagation on y (= x * x):

y.backward()
x.grad  # d(sum(x * x))/dx = 2 * x

array([0., 2., 4., 6.])

In [32]:
# Without detaching, the gradient flows through y as well:

with autograd.record():
    y = x * x         # x=[0,1,2,3], y = x*x = [0,1,4,9]
    z = y * x         # z = x*x*x = (x0^3, x1^3, x2^3, x3^3) ===> x0^3 + x1^3 + x2^3 + x3^3
z.backward()          # z' = (3*x0^2, 3*x1^2, 3*x2^2, 3*x3^2)
x.grad                #    = (3*0, 3*1, 3*4, 3*9) = (0, 3, 12, 27)

array([ 0.,  3., 12., 27.])

In [33]:
# *** SKIP THIS *** Useless diversion


# Studying a weird function
# Here is a weird function:

def f(a):
    """Scale `a` by a data-dependent constant using Python control flow.

    `a` is doubled repeatedly until the norm of the result is at least 1000.
    If the sum of that result is positive it is returned unchanged; otherwise
    it is multiplied by 100 first. For any given input the output is therefore
    ``k * a`` for some constant ``k``, i.e. the function is piecewise linear.

    Args:
        a: Scalar or array supporting ``*``, ``.sum()`` and ``np.linalg.norm``.

    Returns:
        The scaled value ``k * a``.

    Raises:
        ValueError: If `a` has zero norm. Doubling zero never reaches the
            norm threshold, so the loop below would never terminate.
    """
    b = a * 2

    # Guard against the non-terminating case: zero stays zero under doubling.
    if np.linalg.norm(b) == 0:
        raise ValueError("f(a) is undefined for zero input: doubling never reaches norm 1000")

    # Keep doubling until the magnitude reaches the threshold.
    while np.linalg.norm(b) < 1000:
        b = b * 2

    # Non-positive sums get an extra factor of 100 (a different linear piece).
    if b.sum() > 0:
        c = b
    else:
        c = 100 * b
    return c

# This function is PIECEWISE linear !!!

In [49]:
# Evaluate f at a concrete point (note: this rebinds the name `x` used by earlier cells).
# 2.01 is doubled until the norm is >= 1000: 2.01 * 512 = 1029.12
x=np.array([2.01])
f(x)

array([1029.12])

In [42]:
# Draw a random scalar input. No seed is set in the visible cells, so the
# value (and hence the output below) presumably changes from run to run.
a = np.random.normal()
f(a)

array(1585.1598)

In [52]:
# Compute f'(a): the gradient at a

a.attach_grad()
with autograd.record():
    d = f(a)
d.backward()     # Because f is piecewise linear, d = k * a on the piece containing a,
d/a              # so the gradient is the constant k = d / a (NOT d itself)

array(2048.)

In [51]:
# Gradient computed by autograd; it should equal the ratio d / a shown in the previous cell
a.grad

array(2048.)