Federated Newton Learn

In this section we present the Federated Newton Learn (FedNL) algorithm introduced in [2]. The following table summarises the convergence of the different FedNL flavours implemented below; in particular, the Error column reports \(\mathcal{L}(x_{50})-\mathcal{L}(x^*)\).

| Flavour | tol | Iterations | Error |
|---------|-----|------------|-------|
| Vanilla Newton | 1e-4 | 4 | 0.0 |
| Rank 1 Compression | 1e-4 | 10 | 1.4901161193847656e-07 |
| Rank 1 Compression, Diagonal Regularisation, Identity Initial Hessian | 1e-4 | >50 | 0.1078076362609863 |
| Rank 1 Compression, Diagonal Regularisation, Null Initial Hessian | 1e-4 | >50 | 8.705258369445801e-05 |
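
All flavours follow the same FedNL template from [2]: every worker \(i\) keeps a local Hessian estimate \(H_i^k\) and, at each iteration, sends the master a compressed correction towards its true local Hessian together with its local gradient; the master averages these and takes a Newton-type step. A hedged sketch of one iteration, matching the loop used in the cells below with step size \(\alpha\), is

\[
H_i^{k+1} = H_i^{k} + \alpha\,\mathcal{C}\bigl(\nabla^2 f_i(x^{k}) - H_i^{k}\bigr), \qquad
x^{k+1} = x^{k} - \bigl(H^{k+1}\bigr)^{-1} g^{k},
\]

where \(H^{k+1} = \frac{1}{n}\sum_i H_i^{k+1}\), \(g^{k} = \frac{1}{n}\sum_i \nabla f_i(x^{k})\), and \(\mathcal{C}\) is the compression operator (full rank for vanilla Newton, rank 1 for the compressed flavours); see [2] for the precise options and convergence guarantees.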

[1]:
from ipyparallel import Client
c = Client()
c.ids
[1]:
[0, 1]
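
The %%px cells below assume the two ipyparallel engines were launched with MPI support (they all build comm = MPI.COMM_WORLD). A quick hedged sanity check that the engines really map to distinct MPI ranks is:

%%px
from mpi4py import MPI
# Each engine should print a different rank out of a world of size 2.
print(MPI.COMM_WORLD.Get_rank(), "/", MPI.COMM_WORLD.Get_size())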

Vanilla Newton

First we reimplement the vanilla Newton method in the FedNL framework: we run the same loop as the compressed flavours below, but compress the Hessian shift at full rank (rk = 119), so the learned Hessian coincides with the exact averaged Hessian and the update reduces to a Newton step.
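
Each worker minimises the \(\ell_2\)-regularised logistic loss on its shard of the a1a dataset, which is exactly the Loss function defined in the cell below (with \(\lambda = 10^{-3}\)):

\[
f_i(x) = \frac{1}{m_i}\sum_{j \in \mathcal{D}_i} \log\bigl(1 + e^{-y_j a_j^{\top} x}\bigr) + \lambda\,\lVert x \rVert^2,
\]

where \(\mathcal{D}_i\) is the index set of worker \(i\)'s \(m_i\) samples and \(a_j, y_j\) are the features and label of sample \(j\).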

[8]:
%%px
import tensorflow as tf
import numpy as np
import scipy.linalg as la
import pandas as pd
import matplotlib.pyplot as plt
from numsa.TFHessian import *
import dsdl

comm = MPI.COMM_WORLD

ds = dsdl.load("a1a")

X, Y = ds.get_train()
indx = np.array_split(range(X.shape[0]),int(comm.Get_size()));
tfX = []
tfY = []
for k in range(len(indx)):
    tfX = tfX + [tf.sparse.from_dense(np.array(X[indx[comm.Get_rank()]].todense(), dtype=np.float32))]
    tfY = tfY + [tf.convert_to_tensor(np.array(Y[indx[comm.Get_rank()]], dtype=np.float32).reshape(X[indx[comm.Get_rank()]].shape[0], 1))]

tfXs = tf.sparse.from_dense(np.array(X.todense(), dtype=np.float32))
tfYs = tf.convert_to_tensor(np.array(Y, dtype=np.float32).reshape(X.shape[0], 1))
#Defining the Loss Function
def LossSerial(x):
    lam = 1e-3; #Regularisation
    x = tf.reshape(x, (119, 1))
    Z = tf.sparse.sparse_dense_matmul(tfXs, x, adjoint_a=False)
    Z = tf.math.multiply(tfYs, Z)
    S = tf.reduce_sum(tf.math.log(1 + tf.math.exp(-Z)) / tfXs.shape[0]) + lam*tf.norm(x)**2

    return S
#Defining the Loss Function
def Loss(x,comm):
    lam = 1e-3; #Regularisation
    x = tf.reshape(x, (119, 1))
    Z = tf.sparse.sparse_dense_matmul(tfX[comm.Get_rank()], x, adjoint_a=False)
    Z = tf.math.multiply(tfY[comm.Get_rank()], Z)
    S = tf.reduce_sum(tf.math.log(1 + tf.math.exp(-Z)) / tfX[comm.Get_rank()].shape[0]) + lam*tf.norm(x)**2
    return S
################! Setting Of The Solver!##################
itmax = 50
tol = 1e-4;
step_size=1;
###########################################################
x = tf.Variable(0.1*np.ones((119,1),dtype=np.float32))

Res = [];

H = Hessian(Loss,x);
H.shift(x)#,start=0*np.identity(x.numpy().shape[0])) #We initialize the shifter
#We now collect and average the loc Hessians in the master node (rk 0)
Hs = H.comm.gather(H.memH, root=0);
if H.comm.Get_rank()==0:
    Hm = (1/len(Hs))*np.sum(Hs,0);
else:
    Hm = None
print("The master Hessian has been initialised")
for it in tqdm(range(itmax)):
    # Obtaining the compression of the difference between local mat
    # and next local mat.
    U,sigma,Vt,ell = H.shift(x,{"comp":MatSVDCompDiag,"rk":119,"type":"mat"});
    shift = Vt.transpose()@np.diag(sigma)@U.transpose();
    #print("Updating local Hessian")
    H.memH = H.memH+step_size*shift;
    grad = H.grad().numpy();
    #Now we update the master Hessian and perform the Newton method step
    Shifts = H.comm.gather(shift, root=0);
    Grads = H.comm.gather(grad, root=0);
    Ells = H.comm.gather(ell, root=0);
    if H.comm.Get_rank() == 0:
        #print("Computing the avarage of the local shifts and grad ...")
        Shift = (1/len(Shifts))*np.sum(Shifts,0);
        Grad = (1/len(Grads))*np.sum(Grads,0);
        Ell = (1/len(Ells))*np.sum(Ells,0);
        res = np.linalg.norm(Grad);
        Res = Res + [res]
        #print("Computing the master Hessian ...")
        Hm = Hm + step_size*Shift;
        #print("Searching new search direction ...")
        A = Hm; #A = Hm + Ell*np.identity(Hm.shape[0]);
        q = np.linalg.solve(A,Grad);
        #print("Found search dir, ",q);
        if it%25 == 0:
            print("(FedNL) [Iteration. {}] Lost funciton at this iteration {}  and gradient norm {}".format(it,LossSerial(x),np.linalg.norm(Grad)));
        x = x - tf.Variable(q,dtype=np.float32);
        x =  tf.Variable(x)
    else:
        res = None
    #Distributing the search direction
    x = H.comm.bcast(x,root=0)
    res = H.comm.bcast(res,root=0)
    if res<tol:
            break
LossStar =  0.33691510558128357;
print("Lost funciton at this iteration {}, gradient norm {} and error {}.".format(LossSerial(x),np.linalg.norm(grad),abs(LossSerial(x)-LossStar)))
[stderr:1]   8%|▊         | 4/50 [00:18<03:30,  4.57s/it]

[stderr:0]   8%|▊         | 4/50 [00:18<03:30,  4.57s/it]

[stdout:1] The master Hessian has been initialised
Loss function at this iteration 0.33691510558128357, gradient norm 0.020152254030108452 and error 0.0.

[stdout:0] The master Hessian has been initialised
(FedNL) [Iteration 0] Loss function at this iteration 1.2675940990447998 and gradient norm 1.388305902481079
Loss function at this iteration 0.33691510558128357, gradient norm 0.02014993503689766 and error 0.0.

[10]:
import matplotlib.pyplot as plt
Rs = c[:]["Res"][0]
plt.semilogy(range(len(Rs)),Rs)
plt.title("Residual Decay")
[10]:
Text(0.5, 1.0, 'Residual Decay')
../_images/TensorFlow_FEDNL_4_1.png

FEDNL Rank 1 Compression
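
Here the Hessian shift is compressed to rank 1 by keeping only the leading singular triplet of the difference between the true local Hessian and its current estimate (the call to MatSVDCompDiag with rk = 1 below). A minimal NumPy sketch of such a rank-1 SVD compressor, as a hypothetical stand-in rather than numsa's actual implementation, is:

import numpy as np

def rank1_svd_compress(M):
    # Hedged sketch (hypothetical helper, not numsa.TFHessian.MatSVDCompDiag):
    # keep only the leading singular triplet of M.
    U, s, Vt = np.linalg.svd(M, full_matrices=False)
    return s[0] * np.outer(U[:, 0], Vt[0, :])

This is the best rank-1 approximation in the Frobenius norm, so only one singular value and two length-\(d\) vectors need to be communicated per iteration instead of the full \(d \times d\) shift.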

[2]:
%%px
import tensorflow as tf
import numpy as np
import scipy.linalg as la
import pandas as pd
import matplotlib.pyplot as plt
from numsa.TFHessian import *
import dsdl

comm = MPI.COMM_WORLD

ds = dsdl.load("a1a")

X, Y = ds.get_train()
indx = np.array_split(range(X.shape[0]),int(comm.Get_size()));
tfX = []
tfY = []
for k in range(len(indx)):
    tfX = tfX + [tf.sparse.from_dense(np.array(X[indx[comm.Get_rank()]].todense(), dtype=np.float32))]
    tfY = tfY + [tf.convert_to_tensor(np.array(Y[indx[comm.Get_rank()]], dtype=np.float32).reshape(X[indx[comm.Get_rank()]].shape[0], 1))]

tfXs = tf.sparse.from_dense(np.array(X.todense(), dtype=np.float32))
tfYs = tf.convert_to_tensor(np.array(Y, dtype=np.float32).reshape(X.shape[0], 1))
#Defining the Loss Function
def LossSerial(x):
    lam = 1e-3; #Regularisation
    x = tf.reshape(x, (119, 1))
    Z = tf.sparse.sparse_dense_matmul(tfXs, x, adjoint_a=False)
    Z = tf.math.multiply(tfYs, Z)
    S = tf.reduce_sum(tf.math.log(1 + tf.math.exp(-Z)) / tfXs.shape[0]) + lam*tf.norm(x)**2

    return S
#Defining the Loss Function
def Loss(x,comm):
    lam = 1e-3; #Regularisation
    x = tf.reshape(x, (119, 1))
    Z = tf.sparse.sparse_dense_matmul(tfX[comm.Get_rank()], x, adjoint_a=False)
    Z = tf.math.multiply(tfY[comm.Get_rank()], Z)
    S = tf.reduce_sum(tf.math.log(1 + tf.math.exp(-Z)) / tfX[comm.Get_rank()].shape[0]) + lam*tf.norm(x)**2
    return S
################! Setting Of The Solver!##################
itmax = 50
tol = 1e-4;
step_size=1;
###########################################################
x = tf.Variable(0.1*np.ones((119,1),dtype=np.float32))

Res = [];

H = Hessian(Loss,x);
H.shift(x)#,start=0*np.identity(x.numpy().shape[0])) #We initialize the shifter
#We now collect and average the loc Hessians in the master node (rk 0)
Hs = H.comm.gather(H.memH, root=0);
if H.comm.Get_rank()==0:
    Hm = (1/len(Hs))*np.sum(Hs,0);
else:
    Hm = None
print("The master Hessian has been initialised")
for it in tqdm(range(itmax)):
    # Obtaining the compression of the difference between local mat
    # and next local mat.
    U,sigma,Vt,ell = H.shift(x,{"comp":MatSVDCompDiag,"rk":1,"type":"mat"});
    shift = Vt.transpose()@np.diag(sigma)@U.transpose();
    #print("Updating local Hessian")
    H.memH = H.memH+step_size*shift;
    grad = H.grad().numpy();
    #Now we update the master Hessian and perform the Newton method step
    Shifts = H.comm.gather(shift, root=0);
    Grads = H.comm.gather(grad, root=0);
    Ells = H.comm.gather(ell, root=0);
    if H.comm.Get_rank() == 0:
        #print("Computing the avarage of the local shifts and grad ...")
        Shift = (1/len(Shifts))*np.sum(Shifts,0);
        Grad = (1/len(Grads))*np.sum(Grads,0);
        Ell = (1/len(Ells))*np.sum(Ells,0);
        res = np.linalg.norm(Grad);
        Res = Res + [res];
        #print("Computing the master Hessian ...")
        Hm = Hm + step_size*Shift;
        #print("Searching new search direction ...")
        A = Hm; #A = Hm + Ell*np.identity(Hm.shape[0]);
        q = np.linalg.solve(A,Grad);
        #print("Found search dir, ",q);
        if it%25 == 0:
            print("(FedNL) [Iteration. {}] Lost funciton at this iteration {}  and gradient norm {}".format(it,LossSerial(x),np.linalg.norm(Grad)));
        x = x - tf.Variable(q,dtype=np.float32);
        x =  tf.Variable(x)
    else:
        res = None
    #Distributing the search direction
    x = H.comm.bcast(x,root=0)
    res = H.comm.bcast(res,root=0)
    if res<tol:
            break
LossStar =  0.33691510558128357;
print("Lost funciton at this iteration {}, gradient norm {} and error {}.".format(LossSerial(x),np.linalg.norm(grad),abs(LossSerial(x)-LossStar)))
[stderr:0]  20%|██        | 10/50 [00:37<02:28,  3.72s/it]

[stderr:1]  20%|██        | 10/50 [00:37<02:28,  3.72s/it]

[stdout:1] The master Hessian has been initialised
Loss function at this iteration 0.3369152545928955, gradient norm 0.02014790289103985 and error 1.4901161193847656e-07.

[stdout:0] The master Hessian has been initialised
(FedNL) [Iteration 0] Loss function at this iteration 1.2675940990447998 and gradient norm 1.388305902481079
Loss function at this iteration 0.3369152545928955, gradient norm 0.020166713744401932 and error 1.4901161193847656e-07.

[6]:
import matplotlib.pyplot as plt
Rs = c[:]["Res"][0]
plt.loglog(range(len(Rs)),Rs)
plt.title("Residual Decay")
[6]:
Text(0.5, 1.0, 'Residual Decay')
../_images/TensorFlow_FEDNL_7_1.png

FEDNL With Diagonal Regularisation And Different Initial Hessian
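
In this flavour the shifter is initialised with the identity matrix and the Newton system is regularised by the averaged compression error \(\ell^k\) returned by the shifter (the line A = Hm + Ell*np.identity(Hm.shape[0]) in the cell below). As a hedged sketch, the step becomes

\[
x^{k+1} = x^{k} - \bigl(H^{k+1} + \ell^{k} I\bigr)^{-1} g^{k}, \qquad
\ell^{k} = \frac{1}{n}\sum_{i=1}^{n} \ell_i^{k}.
\]

Because the identity is far from the true Hessian and the rank-1 shifts correct it only slowly, this run does not reach the tolerance within 50 iterations (see the table above).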

[5]:
%%px
import tensorflow as tf
import numpy as np
import scipy.linalg as la
import pandas as pd
import matplotlib.pyplot as plt
from numsa.TFHessian import *
import dsdl

comm = MPI.COMM_WORLD

ds = dsdl.load("a1a")

X, Y = ds.get_train()
indx = np.array_split(range(X.shape[0]),int(comm.Get_size()));
tfX = []
tfY = []
for k in range(len(indx)):
    tfX = tfX + [tf.sparse.from_dense(np.array(X[indx[comm.Get_rank()]].todense(), dtype=np.float32))]
    tfY = tfY + [tf.convert_to_tensor(np.array(Y[indx[comm.Get_rank()]], dtype=np.float32).reshape(X[indx[comm.Get_rank()]].shape[0], 1))]

tfXs = tf.sparse.from_dense(np.array(X.todense(), dtype=np.float32))
tfYs = tf.convert_to_tensor(np.array(Y, dtype=np.float32).reshape(X.shape[0], 1))
#Defining the Loss Function
def LossSerial(x):
    lam = 1e-3; #Regularisation
    x = tf.reshape(x, (119, 1))
    Z = tf.sparse.sparse_dense_matmul(tfXs, x, adjoint_a=False)
    Z = tf.math.multiply(tfYs, Z)
    S = tf.reduce_sum(tf.math.log(1 + tf.math.exp(-Z)) / tfXs.shape[0]) + lam*tf.norm(x)**2

    return S
#Defining the Loss Function
def Loss(x,comm):
    lam = 1e-3; #Regularisation
    x = tf.reshape(x, (119, 1))
    Z = tf.sparse.sparse_dense_matmul(tfX[comm.Get_rank()], x, adjoint_a=False)
    Z = tf.math.multiply(tfY[comm.Get_rank()], Z)
    S = tf.reduce_sum(tf.math.log(1 + tf.math.exp(-Z)) / tfX[comm.Get_rank()].shape[0]) + lam*tf.norm(x)**2
    return S
################! Setting Of The Solver!##################
itmax = 50
tol = 1e-4;
step_size=1;
###########################################################
x = tf.Variable(0.1*np.ones((119,1),dtype=np.float32))

H = Hessian(Loss,x);
H.shift(x,start=np.eye(x.numpy().shape[0])) #We initialize the shifter
#We now collect and average the loc Hessians in the master node (rk 0)
Hs = H.comm.gather(H.memH, root=0);
if H.comm.Get_rank()==0:
    Hm = (1/len(Hs))*np.sum(Hs,0);
else:
    Hm = None
print("The master Hessian has been initialised")
for it in tqdm(range(itmax)):
    # Obtaining the compression of the difference between local mat
    # and next local mat.
    U,sigma,Vt,ell = H.shift(x,{"comp":MatSVDCompDiag,"rk":1,"type":"mat"});
    shift = Vt.transpose()@np.diag(sigma)@U.transpose();
    #print("Updating local Hessian")
    H.memH = H.memH+step_size*shift;
    grad = H.grad().numpy();
    #Now we update the master Hessian and perform the Newton method step
    Shifts = H.comm.gather(shift, root=0);
    Grads = H.comm.gather(grad, root=0);
    Ells = H.comm.gather(ell, root=0);
    if H.comm.Get_rank() == 0:
        #print("Computing the avarage of the local shifts and grad ...")
        Shift = (1/len(Shifts))*np.sum(Shifts,0);
        Grad = (1/len(Grads))*np.sum(Grads,0);
        Ell = (1/len(Ells))*np.sum(Ells,0);
        res = np.linalg.norm(Grad);
        #print("Computing the master Hessian ...")
        Hm = Hm + step_size*Shift;
        #print("Searching new search direction ...")
        A = Hm + Ell*np.identity(Hm.shape[0]);
        q = np.linalg.solve(A,Grad);
        #print("Found search dir, ",q);
        if it%25 == 0:
            print("(FedNL) [Iteration. {}] Lost funciton at this iteration {}  and gradient norm {}".format(it,LossSerial(x),np.linalg.norm(Grad)));
        x = x - tf.Variable(q,dtype=np.float32);
        x =  tf.Variable(x)
    else:
        res = None
    #Distributing the search direction
    x = H.comm.bcast(x,root=0)
    res = H.comm.bcast(res,root=0)
    if res<tol:
            break
LossStar =  0.33691510558128357;
print("Lost funciton at this iteration {}, gradient norm {} and error {}.".format(LossSerial(x),np.linalg.norm(grad),abs(LossSerial(x)-LossStar)))
[stderr:1] 100%|██████████| 50/50 [02:58<00:00,  3.58s/it]

[stderr:0] 100%|██████████| 50/50 [02:58<00:00,  3.58s/it]

[stdout:1] The master Hessian has been initialised
Loss function at this iteration 0.4447227418422699, gradient norm 0.1290498971939087 and error 0.10780763626098633.

[stdout:0] The master Hessian has been initialised
(FedNL) [Iteration 0] Loss function at this iteration 1.2675940990447998 and gradient norm 1.388305902481079
(FedNL) [Iteration 25] Loss function at this iteration 0.4930209219455719 and gradient norm 0.16765998303890228
Loss function at this iteration 0.4447227418422699, gradient norm 0.10724713653326035 and error 0.10780763626098633.

FEDNL LowRank + Diagonal
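
In the last flavour the shifter is initialised with a "low-rank + diagonal" surrogate of the local Hessian: the exact diagonal plus a rank-30 SVD compression of the off-diagonal part (the MatSVDComp call in the cell below). A minimal NumPy sketch of this initialiser, assuming a generic dense local Hessian Hloc, could look like:

import numpy as np

def lowrank_plus_diag_init(Hloc, rank=30):
    # Hedged sketch (hypothetical helper): exact diagonal of Hloc plus the
    # best rank-`rank` approximation of its off-diagonal part.
    D = np.diag(np.diag(Hloc))
    U, s, Vt = np.linalg.svd(Hloc - D, full_matrices=False)
    return D + (U[:, :rank] * s[:rank]) @ Vt[:rank, :]

The rank-1 FedNL loop that follows is then identical to the previous flavours; only the starting point of the Hessian learning changes.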

[3]:
from ipyparallel import Client
c = Client()
c.ids
[3]:
[0, 1]
[7]:
%%px
import tensorflow as tf
import numpy as np
import scipy.linalg as la
import pandas as pd
import matplotlib.pyplot as plt
from numsa.TFHessian import *
import dsdl

comm = MPI.COMM_WORLD

ds = dsdl.load("a1a")

X, Y = ds.get_train()
indx = np.array_split(range(X.shape[0]),int(comm.Get_size()));
tfX = []
tfY = []
for k in range(len(indx)):
    tfX = tfX + [tf.sparse.from_dense(np.array(X[indx[comm.Get_rank()]].todense(), dtype=np.float32))]
    tfY = tfY + [tf.convert_to_tensor(np.array(Y[indx[comm.Get_rank()]], dtype=np.float32).reshape(X[indx[comm.Get_rank()]].shape[0], 1))]

tfXs = tf.sparse.from_dense(np.array(X.todense(), dtype=np.float32))
tfYs = tf.convert_to_tensor(np.array(Y, dtype=np.float32).reshape(X.shape[0], 1))
#Defining the Loss Function
def LossSerial(x):
    lam = 1e-3; #Regularisation
    x = tf.reshape(x, (119, 1))
    Z = tf.sparse.sparse_dense_matmul(tfXs, x, adjoint_a=False)
    Z = tf.math.multiply(tfYs, Z)
    S = tf.reduce_sum(tf.math.log(1 + tf.math.exp(-Z)) / tfXs.shape[0]) + lam*tf.norm(x)**2

    return S
#Defining the Loss Function
def Loss(x,comm):
    lam = 1e-3; #Regularisation
    x = tf.reshape(x, (119, 1))
    Z = tf.sparse.sparse_dense_matmul(tfX[comm.Get_rank()], x, adjoint_a=False)
    Z = tf.math.multiply(tfY[comm.Get_rank()], Z)
    S = tf.reduce_sum(tf.math.log(1 + tf.math.exp(-Z)) / tfX[comm.Get_rank()].shape[0]) + lam*tf.norm(x)**2
    return S
################! Setting Of The Solver!##################
itmax = 50
tol = 1e-4;
step_size=1;
###########################################################
x = tf.Variable(0.1*np.ones((119,1),dtype=np.float32))

Res = [];
TBCHistory = [];

H = Hessian(Loss,x);
H.loc = True;
#We now collect and average the loc Hessians in the master node (rk 0)

print("Compressing Initial Hessian ")
cU, cS, cVt = MatSVDComp(H.mat()-np.diag(np.diag(H.mat())),30);
print("Assembling the the comp ...")
C =  cU@np.diag(cS)@cVt;
print("Built the compression ...")
A = np.diag(np.diag(H.mat()))+C;
H.shift(x,start=A) #We initialize the shifter
Hs = H.comm.gather(H.memH, root=0);
if H.comm.Get_rank()==0:
    Hm = (1/len(Hs))*np.sum(Hs,0);
else:
    Hm = None
print("The master Hessian has been initialised")
for it in tqdm(range(itmax)):
    # Obtaining the compression of the difference between local mat
    # and next local mat.
    U,sigma,Vt,ell = H.shift(x,{"comp":MatSVDCompDiag,"rk":1,"type":"mat"});
    shift = Vt.transpose()@np.diag(sigma)@U.transpose();
    TBCHistory = TBCHistory + [sigma[0]];
    #print("Updating local Hessian")
    H.memH = H.memH+step_size*shift;
    grad = H.grad().numpy();
    #Now we update the master Hessian and perform the Newton method step
    Shifts = H.comm.gather(shift, root=0);
    Grads = H.comm.gather(grad, root=0);
    Ells = H.comm.gather(ell, root=0);
    if H.comm.Get_rank() == 0:
        #print("Computing the avarage of the local shifts and grad ...")
        Shift = (1/len(Shifts))*np.sum(Shifts,0);
        Grad = (1/len(Grads))*np.sum(Grads,0);
        Ell = (1/len(Ells))*np.sum(Ells,0);
        res = np.linalg.norm(Grad);
        Res = Res + [res];
        #print("Computing the master Hessian ...")
        Hm = Hm + step_size*Shift;
        #print("Searching new search direction ...")
        A = Hm; #A = Hm + Ell*np.identity(Hm.shape[0]);
        q = np.linalg.solve(A,Grad);
        #print("Found search dir, ",q);
        if it%25 == 0:
            print("(FedNL) [Iteration. {}] Lost funciton at this iteration {}  and gradient norm {}".format(it,LossSerial(x),np.linalg.norm(Grad)));
        x = x - tf.Variable(q,dtype=np.float32);
        x =  tf.Variable(x)
    else:
        res = None
    #Distributing the search direction
    x = H.comm.bcast(x,root=0)
    res = H.comm.bcast(res,root=0)
    if res<tol:
            break
LossStar =  0.33691510558128357;
print("Lost funciton at this iteration {}, gradient norm {} and error {}.".format(LossSerial(x),np.linalg.norm(grad),abs(LossSerial(x)-LossStar)))
[stdout:0] Compressing Initial Hessian
Assembling the compression ...
Built the compression ...
The master Hessian has been initialised
(FedNL) [Iteration 0] Loss function at this iteration 1.2675940990447998 and gradient norm 1.388305902481079
Loss function at this iteration 0.3369157016277313, gradient norm 0.02016145922243595 and error 5.960464477539062e-07.

[stdout:1] Compressing Initial Hessian
Assembling the compression ...
Built the compression ...
The master Hessian has been initialised
Loss function at this iteration 0.3369157016277313, gradient norm 0.020154612138867378 and error 5.960464477539062e-07.

[stderr:0]  46%|████▌     | 23/50 [01:20<01:33,  3.48s/it]

[stderr:1]  46%|████▌     | 23/50 [01:20<01:33,  3.48s/it]

[9]:
import matplotlib.pyplot as plt
Rs = c[:]["Res"][0]
plt.semilogy(range(len(Rs)),Rs)
plt.title("Residual Decay")
TBCs = c[:]["TBCHistory"][0]
TBCs[0]=1.
plt.semilogy(range(len(TBCs)),TBCs)
plt.title("Decay")
plt.legend(["Residual",r"$\sigma_0$ Compression"])
[9]:
<matplotlib.legend.Legend at 0x7fa2b6eb1610>
../_images/TensorFlow_FEDNL_13_1.png