// Copyright (C) 2013  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.

#include <dlib/python.h>
Davis King's avatar
Davis King committed
5
6
7
#include <dlib/statistics.h>

using namespace dlib;
8
namespace py = pybind11;
Davis King's avatar
Davis King committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49

// Sparse vector representation used throughout this file: a sequence of
// (index, value) pairs.
using sparse_vect = std::vector<std::pair<unsigned long, double>>;

// Bundle of results produced by _cca1() and exposed to Python as "cca_outputs".
struct cca_outputs
{
    // Estimated correlations; assigned from the return value of cca().
    matrix<double,0,1> correlations;
    // Transformation matrix for the L vectors; filled in by cca().
    matrix<double> Ltrans;
    // Transformation matrix for the R vectors; filled in by cca().
    matrix<double> Rtrans;
};

// Python-facing wrapper around cca() for sparse input vectors.
//
// Validates the arguments, runs cca() on L and R, and returns the resulting
// correlations together with both transformation matrices in one cca_outputs.
// Signals "Invalid inputs" through pyassert when:
//   - num_correlations is 0,
//   - L or R is empty, or their sizes differ,
//   - regularization is negative.
cca_outputs _cca1 (
    const std::vector<sparse_vect>& L,
    const std::vector<sparse_vect>& R,
    unsigned long num_correlations,
    unsigned long extra_rank,
    unsigned long q,
    double regularization
)
{
    const bool inputs_ok =
        num_correlations > 0 &&
        !L.empty() &&
        !R.empty() &&
        L.size() == R.size() &&
        regularization >= 0;
    pyassert(inputs_ok, "Invalid inputs");

    // cca() writes the two transforms through its output arguments and
    // returns the correlations.
    cca_outputs result;
    result.correlations = cca(L, R,
                              result.Ltrans, result.Rtrans,
                              num_correlations, extra_rank, q, regularization);
    return result;
}

// ----------------------------------------------------------------------------------------

// Non-template wrapper around max_index_plus_one() for sparse_vect, so that a
// plain function can be handed to the Python binding.
unsigned long sparse_vector_max_index_plus_one (
    const sparse_vect& v
)
{
    const unsigned long dimensionality = max_index_plus_one(v);
    return dimensionality;
}

// Applies a CCA transformation matrix to a sparse vector.
//
// Parameters:
//   m - transformation matrix (e.g. Ltrans or Rtrans from cca_outputs).
//   v - sparse vector to transform.
// Requires max_index_plus_one(v) <= m.nr(); otherwise "Invalid Inputs" is
// signalled through pyassert.
// Returns trans(m)*v as a dense column vector.
matrix<double,0,1> apply_cca_transform (
    const matrix<double>& m,
    const sparse_vect& v
)
{
    // Every index referenced by v must correspond to a row of m.  The cast
    // brings the unsigned dimensionality to the signed type used by m.nr().
    pyassert(static_cast<long>(max_index_plus_one(v)) <= m.nr(), "Invalid Inputs");
    return sparse_matrix_vector_multiply(trans(m), v);
}

54
void bind_cca(py::module& m)
Davis King's avatar
Davis King committed
55
{
56
57
58
59
    py::class_<cca_outputs>(m, "cca_outputs")
        .def_readwrite("correlations", &cca_outputs::correlations)
        .def_readwrite("Ltrans", &cca_outputs::Ltrans)
        .def_readwrite("Rtrans", &cca_outputs::Rtrans);
Davis King's avatar
Davis King committed
60

61
    m.def("max_index_plus_one", sparse_vector_max_index_plus_one, py::arg("v"),
Davis King's avatar
Davis King committed
62
63
64
"ensures    \n\
    - returns the dimensionality of the given sparse vector.  That is, returns a    \n\
      number one larger than the maximum index value in the vector.  If the vector    \n\
65
      is empty then returns 0.   "
Davis King's avatar
Davis King committed
66
67
68
    );


69
    m.def("apply_cca_transform", apply_cca_transform, py::arg("m"), py::arg("v"),
Davis King's avatar
Davis King committed
70
71
72
73
74
75
76
77
"requires    \n\
    - max_index_plus_one(v) <= m.nr()    \n\
ensures    \n\
    - returns trans(m)*v    \n\
      (i.e. multiply m by the vector v and return the result)   " 
    );


78
    m.def("cca", _cca1, py::arg("L"), py::arg("R"), py::arg("num_correlations"), py::arg("extra_rank")=5, py::arg("q")=2, py::arg("regularization")=0,
Davis King's avatar
Davis King committed
79
80
81
82
"requires    \n\
    - num_correlations > 0    \n\
    - len(L) > 0     \n\
    - len(R) > 0     \n\
Davis King's avatar
Davis King committed
83
    - len(L) == len(R)    \n\
Davis King's avatar
Davis King committed
84
    - regularization >= 0    \n\
Davis King's avatar
Davis King committed
85
86
87
    - L and R must be properly sorted sparse vectors.  This means they must list their  \n\
      elements in ascending index order and not contain duplicate index values.  You can use \n\
      make_sparse_vector() to ensure this is true.  \n\
Davis King's avatar
Davis King committed
88
89
90
91
92
93
94
95
96
97
98
99
100
101
ensures    \n\
    - This function performs a canonical correlation analysis between the vectors    \n\
      in L and R.  That is, it finds two transformation matrices, Ltrans and    \n\
      Rtrans, such that row vectors in the transformed matrices L*Ltrans and    \n\
      R*Rtrans are as correlated as possible (note that in this notation we    \n\
      interpret L as a matrix with the input vectors in its rows).  Note also that    \n\
      this function tries to find transformations which produce num_correlations    \n\
      dimensional output vectors.    \n\
    - Note that you can easily apply the transformation to a vector using     \n\
      apply_cca_transform().  So for example, like this:     \n\
        - apply_cca_transform(Ltrans, some_sparse_vector)    \n\
    - returns a structure containing the Ltrans and Rtrans transformation matrices    \n\
      as well as the estimated correlations between elements of the transformed    \n\
      vectors.    \n\
Davis King's avatar
Davis King committed
102
103
104
105
    - This function assumes the data vectors in L and R have already been centered    \n\
      (i.e. we assume the vectors have zero means).  However, in many cases it is    \n\
      fine to use uncentered data with cca().  But if it is important for your    \n\
      problem then you should center your data before passing it to cca().   \n\
Davis King's avatar
Davis King committed
106
107
108
109
110
111
112
113
    - This function works with reduced rank approximations of the L and R matrices.    \n\
      This makes it fast when working with large matrices.  In particular, we use    \n\
      the dlib::svd_fast() routine to find reduced rank representations of the input    \n\
      matrices by calling it as follows: svd_fast(L, U,D,V, num_correlations+extra_rank, q)     \n\
      and similarly for R.  This means that you can use the extra_rank and q    \n\
      arguments to cca() to influence the accuracy of the reduced rank    \n\
      approximation.  However, the default values should work fine for most    \n\
      problems.    \n\
Davis King's avatar
Davis King committed
114
115
116
117
118
119
    - The dimensions of the output vectors produced by L*#Ltrans or R*#Rtrans are \n\
      ordered such that the dimensions with the highest correlations come first. \n\
      That is, after applying the transforms produced by cca() to a set of vectors \n\
      you will find that dimension 0 has the highest correlation, then dimension 1 \n\
      has the next highest, and so on.  This also means that the list of estimated \n\
      correlations returned from cca() will always be listed in decreasing order. \n\
Davis King's avatar
Davis King committed
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
    - This function performs the ridge regression version of Canonical Correlation    \n\
      Analysis when regularization is set to a value > 0.  In particular, larger    \n\
      values indicate the solution should be more heavily regularized.  This can be    \n\
      useful when the dimensionality of the data is larger than the number of    \n\
      samples.    \n\
    - A good discussion of CCA can be found in the paper \"Canonical Correlation    \n\
      Analysis\" by David Weenink.  In particular, this function is implemented    \n\
      using equations 29 and 30 from his paper.  We also use the idea of doing CCA    \n\
      on a reduced rank approximation of L and R as suggested by Paramveer S.    \n\
      Dhillon in his paper \"Two Step CCA: A new spectral method for estimating    \n\
      vector models of words\".   " 
        
        );
}