## raw data - X

# (non centred) iris dataset, excluding the categorical variable ''species''
X <- as.matrix(iris[,-5]) 

head(X) ## displays the 6 first rows of X

cls <- as.integer(iris[,5]) ## classes of the 150 iris flowers converted to integers
head(cls) ## displays the 6 first classes 

lev <- unique(cls) ## distinct levels
lev ## displays the distinct levels

N <- nrow(X) # number of individuals
p <- ncol(X) # number of variables 
k <- length(lev) # number of classes 

N; p ; k ## returns these 3 numbers

## centred data - Xc

m.G <- colMeans(X) 
### centroid of the individuals cloud 
## (overall mean vector)

round(m.G,5) ## displays the centroid of X rounded to 6 digits

Xc <- scale(X, scale=FALSE) ## (column) centred data matrix

head(X-Xc) 
## should be equal to 6 copies of m.G 
## (since each row i of Xc = row i of X - m.G)

# matrix of within-class variability: Sw

Xw=Xc # to start with 

for (j in 1:k){  # j ranges from 1 to  k (number of groups)
    
  Xj <- Xc[cls==lev[j],] # submatrix of Xc containing the rows corresponding 
                         # to individuals of group j, 
                         # i.e.,  with the same level lev[j].
  
  Xjc <- scale(Xj,scale=FALSE)  # Xjc = column centered submatrix  Xj.
  
  Xw[cls==lev[j],] <- Xjc  # replaces Xj by the column centred submatrix Xjc.
}

head(Xw) # displays the 6 first rows of Xw

Sw=cov(Xw)

Sw # displays the covariance matrix Sw

# matrix of between-class variability: Sb

Xb = Xc - Xw  

# displays 6 first rows of Xb = 6 copies of the centroid of the setosa group
head(Xb) 

Sb=cov(Xb)

Sb # displays the covariance matrix Sb

# overall covariance matrix: S 

S <- cov(X) ## 

S ; Sw + Sb # should be equal (S=Sb+Sw)



invSw <- solve(Sw) # solve(Sw) = inverse matrix of Sw
eigenData<-eigen(invSw %*% Sb)  
eigenData

round(eigenData$values,5)  

loadings.LDA<-Re(eigenData$vectors[,1:(k-1)])
loadings.LDA 

t(loadings.LDA) %*% loadings.LDA 

discrCapacities<-Re(eigenData$values[1:(k-1)])
round(discrCapacities,5)

lambda1=discrCapacities[1]
round(lambda1,5)

lambda2=discrCapacities[2]
round(lambda2,5)

a1 <- loadings.LDA[,1] ### second column of the loadings matrix
a1
round(lambda1,5)
t(a1)%*% Sb %*% a1 / (t(a1)%*% Sw %*% a1) ;  # should be equal to lambda1

a2 <- loadings.LDA[,2] ### second column of the loadings matrix
a2
round(lambda2,5)
t(a2)%*% Sb %*% a2 / (t(a2)%*% Sw %*% a2) ;  # should be equal to lambda2

## proportion of trace 

J=sum(diag( invSw %*% Sb)) ## trace of the matrix Sw^(-1)Sb
round(J,5)  ### discriminant power rounded to 5 digits 
round(lambda1/J,5) ### prop. of discriminating capacity accounted by LDA1 
round(lambda2/J,5) ### prop. of discriminating capacity accounted by LDA1 

scores.LDA <- Xc %*% loadings.LDA 
head(scores.LDA)

library(repr) 
par(mfrow=c(1,1))

options(repr.plot.width =15, repr.plot.height = 10) ## plot windows dimensions

plot(scores.LDA,col=cls,pch=16,asp=TRUE,cex=1.5,
     main="Projection of the iris flower dataset in the plane defined by LDA1 and LDA2")

abline(v=0,lty=2,col="gray")
abline(h=0,lty=2,col="gray")

text(scores.LDA[,1],scores.LDA[,2],labels=1:N, col=cls,cex=1,pos=1)

### PROJECTION AND HISTOGRAM ON LDA1

library(ggplot2)
library(repr)

df_1 = data.frame(x = scores.LDA[,1], y = rep(0,N))

options(repr.plot.width = 10.5, repr.plot.height = 3)

hist(scores.LDA[1:50,1],xlim=c(-2.25,2.15),ylim=c(0,25),col=cls[1:50],main="Histograms of the scores on LDA1 w.r.t the 3 species")
hist(scores.LDA[51:100,1],col=cls[51:100],add=TRUE)
hist(scores.LDA[101:150,1],col=cls[101:150],add=TRUE)

options(repr.plot.width = 10.65, repr.plot.height = 4)

ggplot(data=df_1, aes(x = x, y = "")) + 
    geom_point(size = 6,color=cls,pch=1) +  
    theme(text = element_text(size = 20), element_line(linewidth = 10)) +
    labs(title="", subtitle = "",
       x="Discriminant scores in LDA1",
       y="")

With regard to the second discriminant axis LDA2, the projection of the 3 iris classes appear rather mixed.
### PROJECTION AND HISTOGRAM ON LDA2

library(ggplot2)
library(repr)

df_2 = data.frame(x = rep(0,N), y = scores.LDA[,2])


options(repr.plot.width = 10, repr.plot.height = 4)

hist(scores.LDA[1:50,2],xlim=c(-0.65,0.7),ylim=c(0,25),col=cls[1:50],main="Histograms of the scores on LDA2 w.r.t. the 3 species")
hist(scores.LDA[51:100,2],col=cls[51:100],add=TRUE)
hist(scores.LDA[101:150,2],col=cls[101:150],add=TRUE)

options(repr.plot.width = 10, repr.plot.height = 3)

ggplot(data=df_2, aes(x = y, y = "")) + 
    geom_point(size = 6,color=cls,pch=1) +  
    theme(text = element_text(size = 20), element_line(linewidth = 10)) +
    labs(title="", subtitle = "",
       y="",
       x="Discriminant scores in LDA2")

X.ACP <-  prcomp(X) ## performs PCA on the covariance matrix of X 
X.ACP 

### discriminating capacity of LDA1

loadings.ACP<-X.ACP$rotation 

u1<-loadings.ACP[,1] # vector of coefficients defining the first PC 
round(u1,5)

discr.acp.1<-(t(u1) %*% Sb %*% u1)/(t(u1) %*% Sw %*% u1) 
discr.acp.1

### discriminating capacity of LDA1

u2<-loadings.ACP[,2] # vector of coefficients defining the second PC
round(u2,5)

discr.acp.2<-(t(u2) %*% Sb %*% u2)/(t(u2) %*% Sw %*% u2) 
discr.acp.2
The explained variances of PC1, PC2, PC3 and PC4 can be obtained as follows:
round(X.ACP$sdev^2,5)

round(var(X.ACP$x[,1]),5); round(var(X.ACP$x[,2]),5); round(var(X.ACP$x[,3]),5); round(var(X.ACP$x[,4]),5)

round(diag(var(X.ACP$x)),5)

round(diag(var(scores.LDA)),5)

u1 ; u2

round( t(u2) %*% u1, 7)  
# u1^T u1 is zero  =>  u1 and u2 are pairwise orthogonal 

round( cov(X %*% u1, X %*% u2), 7)  
## cov(X u1,X u2) = 0 => PC1 and PC2 are noncorrelated

a1 ; a2

round( t(a2) %*% a1, 7) 
# a_2^Ta_1 is not zero  =>  a1 and a2 are not pairwise orthogonal 

round( t(a2) %*% Sw %*% a1, 7)  
# a_2^T Sw a_1 = 0 => a1 and a2 are pairwise Sw-orthogonal
round( cov(Xc %*% a1, Xc %*% a2), 7)  
## cov(Xc a_1,Xc a_2) = 0 => LDA1 and LDA2 are noncorrelated

### new unknwon flower that we pretend to classify

y <- c(6.2, 3.2, 5, 1.9) 
## original sepal and petal measurements of an hypothetical 
## iris flower of unknown species 

m.G <- as.vector(colMeans(X)) ## centroid of the training data (iris dataset)

y <- y - m.G # replaces y by y - m.G (coordinates of y w.r.t the centroid m.G)

dim(y) <- c(4,1)  
# ensures that y is a with $p$ rows and one column
# (column matrix)

y

scores.y <- t(y) %*% loadings.LDA  ## L(y) = (L1(y),L2(y))

scores.y[1] ### L1(y)

scores.y[2] ### L2(y) 

bar.z <- rep(NA,k) ## empty vector with k components
for (i in 1: k){
  bar.z [i]<-mean(scores.LDA[cls==lev[i],1])
}
bar.z ## centroids of the scores (in LDA1) of setosa, versicolor and virginica flowers. 

## rep(scores.y[1],k): vector containing k copies of L1(y)
d.cls=abs((rep(scores.y[1],k) - bar.z)) 
d.cls

ord <- order(d.cls) 
### gives the order of the indices that increasingly 
### sort the values of the vector d.cls

ord
ord[1]

pred.y<-lev[ord[1]]
pred.y

L.discr <- function(df.train,cls.train,df.test=df.train){ 
  
    # df.train: a numerical dataframe or data.matrix with N rows 
    # and p columns used as training data
  
    # cls.train: a vector of length N containing the levels of each individual (row).
  
    # df.test: a numerical (Mxp) dataframe or data.matrix, whose rows 
    # we pretend classify. By default df.test = df.train, if omitted. 
  
  
    X.train <- as.matrix(df.train)  # converts data.frame to a data matrix
 
    X.test <- as.matrix(df.test)  # converts data.frame to a data matrix

    p = ncol(X.train)  # number of variables
    
    N = nrow(X.train)  # number of training individuals, already classified 
    
    M = length(X.test)/p  # number of testing individuals, i,e., to be classified
  
    dim(X.test) <- c(M,p) 
    # to ensure  X.test is defined as a matrix with M rows and p columns
  
    Xc <- scale(X.train,scale=FALSE)  # Xc = column centred training data
  
    m.G <- colMeans(X.train) # centre of gravity of the training data
  
    X.test.c <- X.test-matrix(rep(m.G,M),nrow=M,byrow=TRUE)
    # computes the coordinates of the testing individuals 
    # w.r.t. the centroid m.G of the training data

    cls <- as.vector(cls.train)  # to ensure cls is defined as a vector

    lev <- unique(cls)  # vector containing the distinct (factor) levels 
  
    k = length(lev)  # number of distinct (factor) levels 
  
    S = cov(X)  ## overall covariance matrix 

    
    ### WITHIN-CLASS AND BETWEEN-CLASS SCATTER MATRICES Sw AND Sb
  
    Xw = Xc 
  
    for (j in 1:k){   
        Xj <- Xc[cls==lev[j],]  
        Xw[cls==lev[j],] <- scale(Xj,scale=FALSE)   
    }
  
    Sw = cov(Xw)
  
    Xb = Xc-Xw  
    
    Sb = cov(Xb) # should be equal to S-Sw

    invSw = solve(Sw) # inverse matrix of Sw
  
    eigenData <- eigen(invSw %*% Sb)  # %*% product of matrices 
  
    J <- sum(diag(invSw %*% Sb))  
  
    # JUST TO RECALL :)
    # J is the trace (sum of the diagonal elements) of the matrix  
    # invSw %*%Sb, which corresponds to the sum of the discriminating 
    # capacities for all discriminant axes, also called discriminant 
    # power or discriminatory information of X, which we want to maximize. 
    
    # The eigenvector associated with the largest eigenvalue of inv(Sw)Sb
    # defines the linear combination of the columns of the (centred) matrix X,
    # maximizing the ratio var(Xb a)/ var (Xw a) = (a^T Sb a) / (a^T Sw a), 
    # (between class variability / within class variability), 
    # i.e., defines the first discriminant axis 
    # and the corresponding eigenvalue is the value of the ratio, 
    # called discriminating capacity of the axis.
  
    # The eigenvector associated with the 2nd largest eigenvalue of invSw %*% Sb
    # defines the second discriminatant axis, i.e.,
    # with the second largest discriminating capacity that is uncorrelated 
    # with the first discr. axis (but in general not orthogonal to LDA1, 
    # only W-orthogonal).
    # The corresponding eigenvalue is the 2nd largest value of the ratio
    # intergroup variance / intragroup variance. 
  
    # And so on... 
  
    # The number of discriminant axes is, at most, the number of classes minus one. 

    # Loadings
    loadings.LDA <- Re(eigenData$vectors[,1:(k-1)]) # similar to PCA
  
    dim(loadings.LDA) <- c(p,k-1)
  
    colnames(loadings.LDA) <- paste0(rep("LD",k-1),1:(k-1)) 
    # to name the discr axes as LD1, LD2, ... 
    
    rownames(loadings.LDA) <- colnames(df.train) 
    #  to name the rows of loading matrix with the names of the variables 

    # discriminating capacities
    discrCapacities <- Re(eigenData$values[1:(k-1)])
    names(discrCapacities) <- paste0(rep("LD",k-1),1:(k-1)) 
    prop.trace <- discrCapacities/sum(discrCapacities)
  
    # discriminant scores
    scores.LDA <- Xc %*% loadings.LDA # similar to PCA 
    
    colnames(scores.LDA) <- paste0(rep("LD",k-1),1:(k-1)) 
    
    rownames(scores.LDA) <- rownames(df.train) 
    # to name the rows of scores matrix with the names of the individuals
  
    bar.Z <- matrix(NA,k-1,k) 
    # column j of this matrix will contains the centroid vector bar.zj 
    # of the scores of the elements of class j 

    for (j in 1:k){
        Xj = scores.LDA[cls==lev[j],]
        dim(Xj) <- c(sum(cls==lev[j]),k-1)
        bar.Z[,j] <- colMeans(Xj)
    }
  
    scores.test <- X.test.c %*% loadings.LDA  
    dim(scores.test) <- c(M,k-1)
    colnames(scores.test) <- paste0(rep("LD",k-1),1:(k-1))
    rownames(scores.test) <- rownames(df.test)
  
    pred.test <- rep(NA,M)
  
    for (i in 1:M){
        scores.test.aux <- matrix(rep(scores.test[i,],k),k-1,k)
        dif.aux <- scores.test.aux - bar.Z
        dif.cls <- colSums(dif.aux^2)
        ord <- order(dif.cls)
        pred.test[i] <- lev[ord[1]]
    }
  
   # The function returns a list consisting of several components:

    out<-list(discr.load = loadings.LDA, # loadings defining the discr. axes
              
            discr.cap = discrCapacities, # resp. discr. capacities
              
            trace = J,  # total discriminatory information = sum discr.cap 
              
            prop.trace = prop.trace, # discr.cap / J - analog to explained variance
              
            scores.train = scores.LDA, 
            # coord of the N training individ in the discr. axes
              
            scores.test = scores.test, 
            # coord of the M testing individ in the discr. axes
              
            pred.class = pred.test # predicted classes for the M testing individuals
           )
    return(out) 
}

df.train<-iris[,-5]
cls.train<-as.integer(iris[,5])
df.test<-c(6.2,3.2,5,1.9)

res.y<-L.discr(df.train,cls.train,df.test)
res.y

par(mfrow=c(1,1))
options(repr.plot.width =15, repr.plot.height = 10) ## windows dimensions
plot(res.y$scores.train,pch=16,cex=2,asp=TRUE,col=cls.train)
abline(v=0,lty=2,col="gray")
abline(h=0,lty=2,col="gray")
points(res.y$scores.test[,1],res.y$scores.test[,2],cex=3,pch=4,col=res.y$pred.class)

## Original iris datasets and the testing y vector 

head(iris) ## original iris dataset with all variables in cm
y=c(6.2,3.2,5,1.9)
y

## Modified versions of iris dataset and (testing) y vector 
## to became expressed in the new units of measurements

iris.mod <- iris
iris.mod[,1] <- iris[,1]*10 ## sepal lengths in mm
iris.mod[,3] <- iris[,3]/100 ## petal lengths in m

head(iris.mod) 
var(iris.mod[,-5])  ## covariance matrix 

y.mod <- y*c(10,1,.01,1) 
y.mod

## Applying the L.discr function to the modified data

df.train <- iris.mod[,-5]
cls.train <- as.integer(iris.mod[,5])
df.test <- y.mod

res.mod <- L.discr(df.train,cls.train,df.test) ## testing data = training data

## Comparing the discriminating capacities and the discriminant scores outcomes 
## with the original vs modified data

# The discriminating capacities are the same
res.y$discr.cap ; res.mod$discr.cap  

## The vectors of discriminant scores on each discriminant axis are proportional 
head(res.y$scores.train / res.mod$scores.train)

par(mfrow=c(2,1))
options(repr.plot.width =12, repr.plot.height = 10) ## to define the plot windows dimensions

plot(res.y$scores.train,pch=16,cex=2,col=cls.train, main="Original iris dataset")
abline(v=0,lty=2,col="gray")
abline(h=0,lty=2,col="gray")
points(res.y$scores.test[,1],res.y$scores.test[,2],cex=3,pch=4,col=res.y$pred.class)

plot(res.mod$scores.train,pch=16,cex=2,col=cls.train, main="Modified iris dataset")
abline(v=0,lty=2,col="gray")
abline(h=0,lty=2,col="gray")
points(res.mod$scores.test[,1],res.mod$scores.test[,2],cex=3,pch=4,col=res.mod$pred.class)

## The clouds shapes are the same (up to the axes rescalling and reorientation) 

iris.acp<-prcomp(iris[,-5])
iris.acp.mod<-prcomp(iris.mod[,-5])

par(mfrow=c(2,1))
options(repr.plot.width =12, repr.plot.height = 10) ## plot windows dimensions

plot(iris.acp$x,pch=16,cex=2,col=cls.train, main="Original iris dataset")
abline(v=0,lty=2,col="gray")
abline(h=0,lty=2,col="gray")

plot(iris.acp.mod$x,pch=16,cex=2,col=cls.train, main="Modified iris dataset")
abline(v=0,lty=2,col="gray")
abline(h=0,lty=2,col="gray")

X1=sort(sample(1:50,25))
X2=sort(sample(51:100,25))
X3=sort(sample(101:150,25))


df.train<-iris[c(X1,X2,X3),1:4]  ## training individuals  

cls.train<-as.integer(iris[c(X1,X2,X3),5])  ## resp. training classes 
 
df.test<-iris[-c(X1,X2,X3),1:4]  ## testing individuals

cls.test<-as.integer(iris[-c(X1,X2,X3),5]) ##  actual testing classes 


res<-L.discr(df.train,cls.train,df.test)

res$discr.cap

head(res$discr.load)

head(res$prop.trace)

head(res$scores.test)

### To display the testing data with the actual and predicted classes 

par(mfrow=c(1,1))
options(repr.plot.width =12, repr.plot.height = 8) ## to plot windows dimensions

plot(res$scores.test,pch=1,asp=TRUE,cex=2.5,col=cls.train)
### emptied larger dots - actual testing classes

abline(v=0,lty=2,col="gray")
abline(h=0,lty=2,col="gray")

points(res$scores.test[,1],res$scores.test[,2],col=res$pred.class,cex=1.5,pch=16) 
### filled smaller dots - predicted  testing classes

#confusion matrix 
actual <- factor(cls.test)
predicted <- factor(res$pred.class)

table(actual,predicted)

## Applying the lda function to iris dataset 

library(MASS)

lda.train  <- iris[,-5] 

lda.groups <- as.factor(as.integer(iris[,5])) 

lda.res <- lda(lda.groups ~ . ,data=lda.train)  
## the iris factor is the response variable!

lda.res

## Discriminanting capacities - lda (MASS)  
lda.eig <- lda.res$svd^2
lda.eig 

# Matrix of scores - lda (MASS) 
pred.iris <- predict(lda.res)
scores.iris <- pred.iris$x
class.iris <- pred.iris$class
head(scores.iris)
head(class.iris)

# Matrix of loadings - lda (MASS)
lda.res$scaling

Zw = scores.iris
for (j in 1:k){      
    Zj <- Zw[lda.groups==lev[j],] 
  
    Zjc <- scale(Zj,scale=FALSE)
  
    Zw[lda.groups==lev[j],] <- Zjc
}

round(cov(Zw),7)

N<-nrow(lda.train)
k<-length(unique(lda.groups))
N ; k
res<-L.discr(lda.train,lda.groups)
lda.res$svd^2 *(k-1)/(N-k) ; res$discr.cap  ### should be equal 

# Confusion matrices

actual <- lda.groups
pred.lda <- pred.iris$class
pred.our <- res$pred.class

table(pred.lda,pred.our) 
table(actual,pred.lda) 

par(mfrow=c(2,1))
plot(scores.iris,pch=16,col=class.iris,cex=2, main="lda")
plot(res$scores.train[,1],res$scores.train[,2],pch=16,col=res$pred.class,cex=2,main="L.discr")

### computes the Mahalanobis distance between x,y, 
### assocociated with an invertible covariance matrix S
d.Maha <- function(x,y,S){
    t(x-y) %*% solve(S) %*% (x-y)
}


