$\newcommand{\W}[1]{ \; #1 \; } \newcommand{\R}[1]{ {\rm #1} } \newcommand{\B}[1]{ {\bf #1} } \newcommand{\D}[2]{ \frac{\partial #1}{\partial #2} } \newcommand{\DD}[3]{ \frac{\partial^2 #1}{\partial #2 \partial #3} } \newcommand{\Dpow}[2]{ \frac{\partial^{#1}}{\partial {#2}^{#1}} } \newcommand{\dpow}[2]{ \frac{ {\rm d}^{#1}}{{\rm d}\, {#2}^{#1}} }$
Differentiate Conjugate Gradient Algorithm: Example and Test

Purpose
The conjugate gradient algorithm is sparse linear solver and a good example where checkpointing can be applied (for each iteration). This example is a preliminary version of a new library routine for the conjugate gradient algorithm.

Algorithm
Given a positive definite matrix $A \in \B{R}^{n \times n}$, a vector $b \in \B{R}^n$, and tolerance $\varepsilon$, the conjugate gradient algorithm finds an $x \in \B{R}^n$ such that $\| A x - b \|^2 / n \leq \varepsilon^2$ (or it terminates at a specified maximum number of iterations).
1. Input:  The matrix $A \in \B{R}^{n \times n}$, the vector $b \in \B{R}^n$, a tolerance $\varepsilon \geq 0$, a maximum number of iterations $m$, and the initial approximate solution $x^0 \in \B{R}^n$ (can use zero for $x^0$).
2. Initialize:  $g^0 = A * x^0 - b$, $d^0 = - g^0$, $s_0 = ( g^0 )^\R{T} g^0$, $k = 0$.
3. Convergence Check:  if $k = m$ or $\sqrt{ s_k / n } < \varepsilon$, return $k$ as the number of iterations and $x^k$ as the approximate solution.
4. Next $x$:  $\mu_{k+1} = s_k / [ ( d^k )^\R{T} A d^k ]$, $x^{k+1} = x^k + \mu_{k+1} d^k$.
5. Next $g$:  $g^{k+1} = g^k + \mu_{k+1} A d^k$, $s_{k+1} = ( g^{k+1} )^\R{T} g^{k+1}$.
6. Next $d$:  $d^{k+1} = - g^k + ( s_{k+1} / s_k ) d^k$.
7. Iterate:  $k = k + 1$, goto Convergence Check.
 # include <cppad/cppad.hpp> # include <cstdlib> # include <cmath> namespace { // Begin empty namespace using CppAD::AD; // A simple matrix multiply c = a * b , where a has n columns // and b has n rows. This should be changed to a function so that // it can efficiently handle the case were A is large and sparse. template <class Vector> // a simple vector class void mat_mul(size_t n, const Vector& a, const Vector& b, Vector& c) { typedef typename Vector::value_type scalar; size_t m, p; m = a.size() / n; p = b.size() / n; assert( m * n == a.size() ); assert( n * p == b.size() ); assert( m * p == c.size() ); size_t i, j, k, ij; for(i = 0; i < m; i++) { for(j = 0; j < p; j++) { ij = i * p + j; c[ij] = scalar(0); for(k = 0; k < n; k++) c[ij] = c[ij] + a[i * m + k] * b[k * p + j]; } } return; } // Solve A * x == b to tolerance epsilon or terminate at m interations. template <class Vector> // a simple vector class size_t conjugate_gradient( size_t m , // input double epsilon , // input const Vector& A , // input const Vector& b , // input Vector& x ) // input / output { typedef typename Vector::value_type scalar; scalar mu, s_previous; size_t i, k; size_t n = x.size(); assert( A.size() == n * n ); assert( b.size() == n ); Vector g(n), d(n), s(1), Ad(n), dAd(1); // g = A * x mat_mul(n, A, x, g); for(i = 0; i < n; i++) { // g = A * x - b g[i] = g[i] - b[i]; // d = - g d[i] = -g[i]; } // s = g^T * g mat_mul(n, g, g, s); for(k = 0; k < m; k++) { s_previous = s[0]; if( s_previous < epsilon ) return k; // Ad = A * d mat_mul(n, A, d, Ad); // dAd = d^T * A * d mat_mul(n, d, Ad, dAd); // mu = s / d^T * A * d mu = s_previous / dAd[0]; // g = g + mu * A * d for(i = 0; i < n; i++) { x[i] = x[i] + mu * d[i]; g[i] = g[i] + mu * Ad[i]; } // s = g^T * g mat_mul(n, g, g, s); // d = - g + (s / s_previous) * d for(i = 0; i < n; i++) d[i] = - g[i] + ( s[0] / s_previous) * d[i]; } return m; } } // End empty namespace bool conj_grad(void) { bool ok = true; // ---------------------------------------------------------------------- // Setup // ---------------------------------------------------------------------- using CppAD::AD; using CppAD::NearEqual; using CppAD::vector; using std::cout; using std::endl; size_t i, j; // size of the vectors size_t n = 40; vector<double> D(n * n), Dt(n * n), A(n * n), x(n), b(n), c(n); vector< AD<double> > a_A(n * n), a_x(n), a_b(n); // D = diagonally dominant matrix // c = vector of ones for(i = 0; i < n; i++) { c[i] = 1.; double sum = 0; for(j = 0; j < n; j++) if( i != j ) { D[ i * n + j ] = std::rand() / double(RAND_MAX); Dt[j * n + i ] = D[i * n + j ]; sum += D[i * n + j ]; } Dt[ i * n + i ] = D[ i * n + i ] = sum * 1.1; } // A = D^T * D mat_mul(n, Dt, D, A); // b = D^T * c mat_mul(n, Dt, c, b); // copy from double to AD<double> for(i = 0; i < n; i++) { a_b[i] = b[i]; for(j = 0; j < n; j++) a_A[ i * n + j ] = A[ i * n + j ]; } // --------------------------------------------------------------------- // Record the function f : b -> x // --------------------------------------------------------------------- // Make b the independent variable vector Independent(a_b); // Solve A * x = b using conjugate gradient method double epsilon = 1e-7; for(i = 0; i < n; i++) a_x[i] = AD<double>(0); size_t m = n + 1; size_t k = conjugate_gradient(m, epsilon, a_A, a_b, a_x); // create f_cg: b -> x and stop tape recording CppAD::ADFun<double> f(a_b, a_x); // --------------------------------------------------------------------- // Check for correctness // --------------------------------------------------------------------- // conjugate gradient should converge with in n iterations ok &= (k <= n); // accuracy to which we expect values to agree double delta = 10. * epsilon * std::sqrt( double(n) ); // copy x from AD<double> to double for(i = 0; i < n; i++) x[i] = Value( a_x[i] ); // check c = A * x mat_mul(n, A, x, c); for(i = 0; i < n; i++) ok &= NearEqual(c[i] , b[i], delta , delta); // forward computation of partials w.r.t. b[0] vector<double> db(n), dx(n); for(j = 0; j < n; j++) db[j] = 0.; db[0] = 1.; // check db = A * dx delta = 5. * delta; dx = f.Forward(1, db); mat_mul(n, A, dx, c); for(i = 0; i < n; i++) ok &= NearEqual(c[i], db[i], delta, delta); return ok; }