wine clustering.Rmd

# Basic data cleaning and exploration

Loading the dataset into the workspace.

# importing the wine dataset

```{r}
library(readr)
# Read the wine measurements; show_col_types = FALSE silences the column
# specification message printed by read_csv().
data <- read_csv("Wine 0.csv - Wine 0.csv.csv", show_col_types = FALSE)
# Standardised copy of the data (scale() centres each column and divides it by
# its standard deviation). The scaling is done once and reused for the matrix
# version instead of being recomputed.
data.scaled <- as.data.frame(scale(data))
data.matrx <- as.matrix(data.scaled)
```

# Check structure of the dataset and data types of variables
```{r}
# Number of rows and columns in the data
dim(data)
```

```{r}
# Class of the object as it was read in
class(data)
```

```{r}
# Convert to a plain data frame and confirm the resulting class
data <- as.data.frame(data)
class(data)
```

```{r}
# Compact structure of the data: one line per column with its type
str(data)
```
Observation:

  1.   Every variable is of numeric (double) data type.
  2.   There are 178 observations of 13 variables.


# Head and tail of the data set

```{r}
# First rows of the data
head(data)
```

```{r}
# Last rows of the data
tail(data)
```

# Checking for null and other types of missing data

```{r}
# Is there at least one NA anywhere in the data frame?
any(is.na(data))
```

```{r}
# Total number of NA cells in the data frame
sum(is.na(data))
```

```{r}
# Check, column by column, whether any infinite value is present
f <- function(x) {
  any(is.infinite(x))
}
# vapply() fixes the result to exactly one logical per column
vapply(data, f, logical(1))
```

```{r}
# Count the infinite values in every column
f <- function(x) {
  sum(is.infinite(x))
}
# vapply() fixes the result to exactly one integer count per column
vapply(data, f, integer(1))
```

```{r}
# Check, column by column, whether any NaN value is present
f <- function(x) {
  any(is.nan(x))
}
# vapply() fixes the result to exactly one logical per column
vapply(data, f, logical(1))
```

```{r}
# Count the NaN values in every column
f <- function(x) {
  sum(is.nan(x))
}
# vapply() fixes the result to exactly one integer count per column
vapply(data, f, integer(1))
```

Observation: There are neither infinite nor NaN values in any variable of the dataset.

# Univariate plot
```{r}
library(ggplot2)
# Histogram of one variable.
#   var   - vector of values to plot
#   label - x-axis title; the default "var" is the title ggplot2 derives from
#           aes(x = var), so calling histf(var) alone looks as before
histf <- function(var, label = "var") {
  ggplot(data, aes(x = var)) +
    geom_histogram() +
    xlab(label)
}
# Map() walks over the columns and their names together, so each histogram is
# titled with its own variable name; the returned list keeps the column names.
Map(histf, data, names(data))
```

```{r}
library(ggplot2)
# Box plot of one variable.
#   var   - vector of values to plot
#   label - y-axis title; the default "var" is the title ggplot2 derives from
#           aes(y = var), so calling boxf(var) alone looks as before
boxf <- function(var, label = "var") {
  ggplot() +
    geom_boxplot(aes(y = var)) +
    ylab(label)
}
# Map() walks over the columns and their names together, so each box plot is
# titled with its own variable name; the returned list keeps the column names.
Map(boxf, data, names(data))

```

```{r}
# Per-column summary statistics
summary(data)
```


```{r}
# Correlation matrix of the variables
cor(data)
```

```{r}
# Row names of the data frame
# NOTE(review): the variable below is called `names`, the same name as the
# base function names() that other chunks call — consider renaming it.
names<-row.names(data)
names
```


```{r}
# Column (variable) names of the data frame
names(data)
```


```{r}
# Mean of every column.
# vapply() works on the data frame column by column (no coercion to a matrix)
# and guarantees one numeric value per variable.
vapply(data, mean, numeric(1))
```
```{r}
# Variance of every column.
# vapply() works on the data frame column by column (no coercion to a matrix)
# and guarantees one numeric value per variable.
vapply(data, var, numeric(1))
```
```{r}
# Principal component analysis on standardised variables.
# The argument is spelled `scale.` in prcomp(); writing it in full avoids
# partial argument matching, and TRUE cannot be reassigned the way `T` can.
pr.out <- prcomp(data, scale. = TRUE)
```

```{r}
# Components stored in the prcomp result
names(pr.out)
```

```{r}
# Values used to centre each variable before the PCA
pr.out$center
```

```{r}
# Values used to scale each variable before the PCA
pr.out$scale
```

```{r}
# Standard deviations of the principal components
pr.out$sdev
```

```{r}
# Principal component loading vectors (one column per component)
pr.out$rotation
```

```{r}
# Dimensions of the principal component score matrix
dim(pr.out$x)
```

```{r}
# First rows of the principal component score matrix
head(pr.out$x)
```


```{r}
# Biplot of the PCA result, drawn with the ggbiplot package
ggbiplot::ggbiplot(pr.out)
# Base-graphics alternative, left disabled:
#biplot(pr.out,scale=0)
```
Observation:
1. Alcohol, Color intensity, Proline, Magnesium and Ash seem to form a cluster.
2. Hue, Total phenols, Flavanoids and OD280/OD315 of diluted wines seem to form another cluster.
3. Malic acid, Nonflavanoid phenols and Alcalinity of ash seem to form yet another cluster.

```{r}
# Variance explained by each principal component
# (square of the component standard deviations)
pr.var <- pr.out$sdev^2
pr.var
```


```{r}
# PVE: proportion of variance explained by each principal component
# (each component's variance divided by the total variance)
pve <- pr.var/sum(pr.var)
pve
```
```{r}
# Plot the PVE and the cumulative PVE of the principal components.
# Two panels stacked in one figure; the graphical parameter is `mfrow`
# (the earlier spelling `mfwow` is not a graphical parameter, so the layout
# was never applied).
par(mfrow = c(2, 1))
plot(pve, xlab = "Principle Component", ylab = "Proportion of variance explained", ylim = c(0, 1), type = "b")
plot(cumsum(pve), xlab = "principle component", ylab = "Cumulative proportion of variance explained", ylim = c(0, 1), type = "b")
```
# Matrix completion

```{r}
# Standardise the variables and keep the result as a numeric matrix `x`,
# then run a PCA on that matrix and summarise it
x <- data.matrix(scale(data))
pcob <- prcomp(x)
summary(pcob)
```
```{r}
# Singular value decomposition of the standardised matrix:
# list the returned components, then show the right singular vectors
# rounded to three decimals
sX<-svd(x)
names(sX)
round(sX$v,3)
```
```{r}
# Loading matrix from prcomp(); compare it with the right singular vectors
# from svd() shown in the previous output (the author notes they are the same)
pcob$rotation
```

# Clustering

## K-means clustering

```{r}
# k-means on the standardised matrix `x`.
# Step 1: scan k = 2, ..., 20 and print, for each k, the total within-cluster
# sum of squares, the total sum of squares and the between-cluster sum of
# squares. The loop index is used as the number of clusters; before, it was
# ignored and the same k = 3 model was refitted on every iteration.
set.seed(4)
for (i in 2:20) {
  km.scan <- kmeans(x, i, nstart = 20)
  cat(i, " | ", km.scan$tot.withinss, " | ", km.scan$totss, " | ", km.scan$betweenss, "\n")
}

# Step 2: the three-cluster solution that the following chunks work with.
# Re-seeding makes this fit independent of how many models the scan above ran.
set.seed(4)
km.out <- kmeans(x, 3, nstart = 20)
```

```{r}
km.out
#km.out$cluster

# Mean of every variable within each k-means cluster (one row per cluster).
# split() groups the rows by cluster label, colMeans() averages each group.
cluster_means <- do.call(
  rbind,
  lapply(split(data, km.out$cluster), colMeans)
)
rownames(cluster_means) <- paste("Cluster", rownames(cluster_means))

# NOTE(review): the original assignment to data.clust.kmeans1 was left
# unfinished; it is given the plain cluster-mean table here — confirm intent.
data.clust.kmeans1 <- data.frame(cluster_means, check.names = FALSE)

# Maximum and minimum of every variable, computed in this chunk so it does not
# depend on objects created further down the document.
variable_range <- rbind(
  Max = vapply(data, max, numeric(1)),
  Min = vapply(data, min, numeric(1))
)

# Range rows first, then the cluster means. check.names = FALSE keeps column
# names that contain spaces (e.g. `Malic acid`) exactly as they are in `data`.
data.clust.kmeans <- data.frame(
  rbind(variable_range, cluster_means),
  check.names = FALSE
)
```

```{r}
# Define the variable ranges: maximum (first row) and minimum (second row) of
# every column in `data`.
# The ranges are computed from the data instead of being typed out per column.
# check.names = FALSE keeps names such as `Malic acid` unchanged; by default
# data.frame() rewrites them to syntactic names (Malic.acid), which would no
# longer match the columns of `data` when the rows are bound together.
max_min <- data.frame(
  rbind(
    vapply(data, max, numeric(1)),
    vapply(data, min, numeric(1))
  ),
  check.names = FALSE
)
rownames(max_min) <- c("Max", "Min")

# Bind the variable ranges to the data
```

```{r}
# Scatter-plot matrix of all variables, coloured by k-means cluster
par(mfrow = c(1, 2))
# Shift the cluster labels by one to pick the plotting colours
cluster_colours <- km.out$cluster + 1
plot(data, col = cluster_colours, pch = 20, cex = 2)
```
# Hierarchical Clustering

```{r}
# Hierarchical clustering of the standardised data with three linkage methods.
# The distance matrix is computed once and shared by all three fits instead of
# being recomputed for each call.
x.dist <- dist(x)
hc.complete <- hclust(x.dist, method = "complete")
hc.average <- hclust(x.dist, method = "average")
hc.single <- hclust(x.dist, method = "single")
```

```{r}
# Text view of the average-linkage tree after conversion to a dendrogram
str(as.dendrogram(hc.average))
```


```{r}
library(dplyr)
library(stats)
# Node appearance for the dendrogram plot: colours, sizes, point symbols,
# fill colours and label styling. Two-element entries follow the nodePar
# convention of plot.dendrogram (inner nodes first, leaves second).
nP <- list(col = 3:2, cex = c(2.0, 0.75), pch = 21:22,
           bg = c("light blue", "pink"),
           lab.cex = 0.75, lab.col = "tomato")
#plot(d3, nodePar= nP, edgePar = list(col = "gray", lwd = 2), horiz = TRUE)
# Horizontal dendrogram of the average-linkage clustering with grey edges.
# TRUE is written out because `T` is an ordinary variable that can be reassigned.
as.dendrogram(hc.average) %>%
  plot(., nodePar = nP, edgePar = list(col = "grey", lwd = 2), horiz = TRUE)
```

```{r}
# Draw the three dendrograms side by side, one panel per linkage method
par(mfrow = c(1, 3))
invisible(Map(
  function(fit, heading) {
    plot(fit, main = heading, xlab = "", sub = "", cex = .9)
  },
  list(hc.complete, hc.average, hc.single),
  c("Complete Linkage", "Average Linkage", "Single Linkage")
))
```

```{r}
# Cluster label of each wine after cutting the trees:
# complete linkage into 3 groups, average into 4, single into 5
cutree(hc.complete,3)
cutree(hc.average,4)
cutree(hc.single,5)
```

## correlation based distance

```{r}
# Correlation-based dissimilarity between wines: one minus the correlation of
# their standardised profiles (rows of x)
row_correlation <- cor(t(x))
dd <- as.dist(1 - row_correlation)
# Complete-linkage tree on that dissimilarity
correlation_tree <- hclust(dd, method = "complete")
plot(
  correlation_tree,
  main = "Complete linkage with correlation-based distance",
  xlab = "",
  sub = ""
)
```