1 Introduction

The Iris flower data set is a renowned multivariate dataset introduced by the British statistician and biologist Ronald Fisher in 1936 in his paper “The use of multiple measurements in taxonomic problems” as a demonstration of linear discriminant analysis. This dataset is alternatively referred to as Anderson’s Iris data set due to Edgar Anderson’s role in gathering the data to assess the morphological diversity among three closely related species of Iris flowers. Two of the three species were collected in the Gaspé Peninsula under uniform conditions to ensure consistency.

Comprising 50 samples from each of the three Iris species (Iris setosa, Iris virginica, and Iris versicolor), the dataset includes measurements of four features - sepal length, sepal width, petal length, and petal width - all recorded in centimeters. Fisher utilized these features to construct a linear discriminant model for species classification. The original publication of Fisher’s work appeared in the Annals of Eugenics, now recognized as the Annals of Human Genetics.[1]

2 Data Preparation

2.1 Summary of Data

# Load the Iris CSV (path is relative to the working directory) and preview
# its first six rows.
dados <- read.csv(file = "input/iris/Iris.csv")
head(dados, n = 6)
##   Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm     Species
## 1  1           5.1          3.5           1.4          0.2 Iris-setosa
## 2  2           4.9          3.0           1.4          0.2 Iris-setosa
## 3  3           4.7          3.2           1.3          0.2 Iris-setosa
## 4  4           4.6          3.1           1.5          0.2 Iris-setosa
## 5  5           5.0          3.6           1.4          0.2 Iris-setosa
## 6  6           5.4          3.9           1.7          0.4 Iris-setosa
# Column-wise summary of the loaded data frame.
summary(object = dados)
##        Id         SepalLengthCm    SepalWidthCm   PetalLengthCm  
##  Min.   :  1.00   Min.   :4.300   Min.   :2.000   Min.   :1.000  
##  1st Qu.: 38.25   1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600  
##  Median : 75.50   Median :5.800   Median :3.000   Median :4.350  
##  Mean   : 75.50   Mean   :5.843   Mean   :3.054   Mean   :3.759  
##  3rd Qu.:112.75   3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100  
##  Max.   :150.00   Max.   :7.900   Max.   :4.400   Max.   :6.900  
##   PetalWidthCm     Species         
##  Min.   :0.100   Length:150        
##  1st Qu.:0.300   Class :character  
##  Median :1.300   Mode  :character  
##  Mean   :1.199                     
##  3rd Qu.:1.800                     
##  Max.   :2.500

3 Visualization

3.1 Attribute histograms

# Requested plot size (repr.plot.* options) for the 2 x 2 grid below.
options(repr.plot.width = 14, repr.plot.height = 10)

# One 30-bin histogram per measurement. The bar outline width is given with
# `linewidth`: since ggplot2 3.4.0 the `size` argument is deprecated for line
# widths and triggers a lifecycle warning.
sepallength <- ggplot(data = dados, mapping = aes(x = SepalLengthCm)) +
  geom_histogram(
    bins = 30, fill = "red", color = "black",
    linewidth = 0.5, alpha = 0.8
  ) +
  theme_economist() +
  xlab("Sepal Length") +
  ggtitle("Sepal Length Histogram")

sepalwidth <- ggplot(data = dados, mapping = aes(x = SepalWidthCm)) +
  geom_histogram(
    bins = 30, fill = "#CC79A7", color = "black",
    linewidth = 0.5, alpha = 0.8
  ) +
  theme_economist() +
  xlab("Sepal Width") +
  ggtitle("Sepal Width Histogram")

petallength <- ggplot(data = dados, mapping = aes(x = PetalLengthCm)) +
  geom_histogram(
    bins = 30, fill = "blue", color = "black",
    linewidth = 0.5, alpha = 0.8
  ) +
  theme_economist() +
  xlab("Petal Length") +
  ggtitle("Petal Length Histogram")

petalwidth <- ggplot(data = dados, mapping = aes(x = PetalWidthCm)) +
  geom_histogram(
    bins = 30, fill = "#0072B2", color = "black",
    linewidth = 0.5, alpha = 0.8
  ) +
  theme_economist() +
  xlab("Petal Width") +
  ggtitle("Petal Width Histogram")

# Arrange the four histograms in a 2 x 2 grid.
plot_grid(sepallength, sepalwidth, petallength, petalwidth, nrow = 2, ncol = 2)

3.2 Species Analysis (with ggplot)

# Theme tweaks layered on top of theme_economist() for the density panels:
# centred title, uniform 10pt axis text, bold legend placed at the bottom.
tema2 <- theme(
  plot.title = element_text(size = 15, hjust = 0.5),
  axis.title.x = element_text(size = 10, color = "black"),
  axis.title.y = element_text(size = 10, color = "black"),
  axis.text.x = element_text(size = 10),
  axis.text.y = element_text(size = 10),
  legend.position = "bottom",
  legend.text = element_text(colour = "black", size = 10, face = "bold")
)

# Requested plot size (repr.plot.* options) for the 2 x 2 grid below.
options(repr.plot.width = 14, repr.plot.height = 10)

# One density curve per species for each measurement. The outline width uses
# `linewidth`; `size` is deprecated for line widths since ggplot2 3.4.0.
sepallength <- ggplot(data = dados, mapping = aes(x = SepalLengthCm)) +
  geom_density(
    mapping = aes(fill = Species),
    color = "black", linewidth = 0.6, alpha = 0.8
  ) +
  theme_economist() +
  xlab("Sepal Length") +
  ggtitle("Sepal Length by Species") +
  tema2

sepalwidth <- ggplot(data = dados, mapping = aes(x = SepalWidthCm)) +
  geom_density(
    mapping = aes(fill = Species),
    color = "black", linewidth = 0.6, alpha = 0.8
  ) +
  theme_economist() +
  xlab("Sepal Width") +
  ggtitle("Sepal Width by Species") +
  tema2

petallength <- ggplot(data = dados, mapping = aes(x = PetalLengthCm)) +
  geom_density(
    mapping = aes(fill = Species),
    color = "black", linewidth = 0.6, alpha = 0.8
  ) +
  theme_economist() +
  xlab("Petal Length") +
  ggtitle("Petal Length by Species") +
  tema2

petalwidth <- ggplot(data = dados, mapping = aes(x = PetalWidthCm)) +
  geom_density(
    mapping = aes(fill = Species),
    color = "black", linewidth = 0.6, alpha = 0.8
  ) +
  theme_economist() +
  xlab("Petal Width") +
  ggtitle("Petal Width by Species") +
  tema2

# Arrange the four density plots in a 2 x 2 grid.
plot_grid(sepallength, sepalwidth, petallength, petalwidth, ncol = 2, nrow = 2)

3.3 Species Analysis (with Ridgeline)

# Theme tweaks for the dark solarized ridgeline panels: white title and axis
# text, no legend. `legend.position` uses the documented lowercase "none".
tema3 <- theme(
  plot.title = element_text(size = 15, hjust = 0.5, vjust = 1, color = "white"),
  axis.title.y = element_text(size = 10, vjust = 2, color = "white"),
  axis.title.x = element_text(size = 10, vjust = -1, color = "white"),
  axis.text.x = element_text(size = 10, color = "white"),
  axis.text.y = element_text(size = 10, color = "white"),
  legend.position = "none"
)

# Requested plot size (repr.plot.* options) for the 2 x 2 grid below.
options(repr.plot.width = 17, repr.plot.height = 13)

# One ridgeline plot per measurement, with a hand-picked bandwidth for each.
# All four panels use the same theme stack: theme_solarized() is a complete
# theme, so a theme_economist() call placed before it would be discarded.
# NOTE(review): no layer maps the colour aesthetic (outlines are a constant
# "black"), so scale_colour_solarized() likely has no visible effect; confirm
# whether a fill scale was intended.
sepallength <- ggplot(
  data = dados,
  mapping = aes(x = SepalLengthCm, y = Species)
) +
  geom_density_ridges(
    mapping = aes(fill = Species),
    bandwidth = 0.181, color = "black", alpha = 0.8
  ) +
  theme_solarized(light = FALSE) +
  scale_colour_solarized("blue") +
  xlab("Sepal Length") +
  ggtitle("Sepal Length by Species") +
  tema3

sepalwidth <- ggplot(
  data = dados,
  mapping = aes(x = SepalWidthCm, y = Species)
) +
  geom_density_ridges(
    mapping = aes(fill = Species),
    bandwidth = 0.134, color = "black", alpha = 0.8
  ) +
  theme_solarized(light = FALSE) +
  scale_colour_solarized("blue") +
  xlab("Sepal Width") +
  ggtitle("Sepal Width by Species") +
  tema3

petallength <- ggplot(
  data = dados,
  mapping = aes(x = PetalLengthCm, y = Species)
) +
  geom_density_ridges(
    mapping = aes(fill = Species),
    bandwidth = 0.155, color = "black", alpha = 0.8
  ) +
  theme_solarized(light = FALSE) +
  scale_colour_solarized("blue") +
  xlab("Petal Length") +
  ggtitle("Petal Length by Species") +
  tema3

petalwidth <- ggplot(
  data = dados,
  mapping = aes(x = PetalWidthCm, y = Species)
) +
  geom_density_ridges(
    mapping = aes(fill = Species),
    bandwidth = 0.075, color = "black", alpha = 0.8
  ) +
  theme_solarized(light = FALSE) +
  scale_colour_solarized("blue") +
  xlab("Petal Width") +
  ggtitle("Petal Width by Species") +
  tema3

# Arrange the four ridgeline plots in a 2 x 2 grid.
plot_grid(sepallength, sepalwidth, petallength, petalwidth, ncol = 2, nrow = 2)

3.4 Violin Plots

# Theme tweaks layered on top of theme_economist() for the violin panels:
# centred title, nudged axis titles, bold legend placed at the bottom.
tema4 <- theme(
  plot.title = element_text(size = 15, hjust = 0.5, vjust = 1),
  axis.title.y = element_text(size = 12, vjust = 2),
  axis.title.x = element_text(size = 12, vjust = -1),
  axis.text.x = element_text(size = 10),
  axis.text.y = element_text(size = 10),
  legend.position = "bottom",
  legend.text = element_text(colour = "black", size = 10, face = "bold")
)

# One violin per species for each measurement, filled with the viridis
# "A" palette. The outline width uses `linewidth`; `size` is deprecated for
# line widths since ggplot2 3.4.0.
sepallength <- ggplot(
  data = dados,
  mapping = aes(x = Species, y = SepalLengthCm, fill = Species)
) +
  geom_violin(linewidth = 0.8) +
  theme_economist() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  ggtitle("Sepal Length") +
  tema4

sepalwidth <- ggplot(
  data = dados,
  mapping = aes(x = Species, y = SepalWidthCm, fill = Species)
) +
  geom_violin(linewidth = 0.8) +
  theme_economist() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  ggtitle("Sepal Width") +
  tema4

petallength <- ggplot(
  data = dados,
  mapping = aes(x = Species, y = PetalLengthCm, fill = Species)
) +
  geom_violin(linewidth = 0.8) +
  theme_economist() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  ggtitle("Petal Length") +
  tema4

petalwidth <- ggplot(
  data = dados,
  mapping = aes(x = Species, y = PetalWidthCm, fill = Species)
) +
  geom_violin(linewidth = 0.8) +
  theme_economist() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  ggtitle("Petal Width") +
  tema4

# Arrange the four violin plots in a 2 x 2 grid.
plot_grid(sepallength, sepalwidth, petallength, petalwidth, ncol = 2, nrow = 2)

# Theme tweaks layered on top of theme_economist() for the violin + boxplot
# panels: centred title, nudged axis titles, bold legend at the bottom.
tema5 <- theme(
  plot.title = element_text(size = 15, hjust = 0.5, vjust = 1),
  axis.title.y = element_text(size = 12, vjust = 2),
  axis.title.x = element_text(size = 12, vjust = -1),
  axis.text.x = element_text(size = 10),
  axis.text.y = element_text(size = 10),
  legend.position = "bottom",
  legend.text = element_text(colour = "black", size = 10, face = "bold")
)

# Violin per species with a narrow, mostly transparent white boxplot drawn on
# top. Outline widths use `linewidth`; `size` is deprecated for line widths
# since ggplot2 3.4.0.
sepallength <- ggplot(
  data = dados,
  mapping = aes(x = Species, y = SepalLengthCm, fill = Species)
) +
  geom_violin(linewidth = 0.8) +
  geom_boxplot(width = 0.1, color = "white", alpha = 0.2, linewidth = 1.2) +
  theme_economist() +
  ggtitle("Sepal Length") +
  tema5

sepalwidth <- ggplot(
  data = dados,
  mapping = aes(x = Species, y = SepalWidthCm, fill = Species)
) +
  geom_violin(linewidth = 0.8) +
  geom_boxplot(width = 0.1, color = "white", alpha = 0.2, linewidth = 1.2) +
  theme_economist() +
  ggtitle("Sepal Width") +
  tema5

petallength <- ggplot(
  data = dados,
  mapping = aes(x = Species, y = PetalLengthCm, fill = Species)
) +
  geom_violin(linewidth = 0.8) +
  geom_boxplot(width = 0.1, color = "white", alpha = 0.2, linewidth = 1.2) +
  theme_economist() +
  ggtitle("Petal Length") +
  tema5

petalwidth <- ggplot(
  data = dados,
  mapping = aes(x = Species, y = PetalWidthCm, fill = Species)
) +
  geom_violin(linewidth = 0.8) +
  geom_boxplot(width = 0.1, color = "white", alpha = 0.2, linewidth = 1.2) +
  theme_economist() +
  ggtitle("Petal Width") +
  tema5

# Arrange the four overlay plots in a 2 x 2 grid.
plot_grid(sepallength, sepalwidth, petallength, petalwidth, ncol = 2, nrow = 2)

3.5 Boxplots

# Theme tweaks layered on top of theme_fivethirtyeight() for the boxplot
# panels: centred title, nudged axis titles, bold legend at the bottom.
tema6 <- theme(
  plot.title = element_text(size = 15, hjust = 0.5, vjust = 1),
  axis.title.y = element_text(size = 12, vjust = 2),
  axis.title.x = element_text(size = 12, vjust = -1),
  axis.text.x = element_text(size = 10),
  axis.text.y = element_text(size = 10),
  legend.position = "bottom",
  legend.text = element_text(colour = "black", size = 10, face = "bold")
)

# Horizontal boxplot per species with error-bar whiskers and the raw points
# jittered on top. The box outline width uses `linewidth` (`size` is
# deprecated for line widths since ggplot2 3.4.0) and is the same, 1.3, in
# all four panels so the grid reads uniformly. `size` on geom_jitter() is the
# point size and is unaffected by that deprecation.
sepallength <- ggplot(
  data = dados,
  mapping = aes(x = SepalLengthCm, y = Species, fill = Species)
) +
  geom_boxplot(linewidth = 1.3) +
  stat_boxplot(geom = "errorbar") +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  theme_fivethirtyeight() +
  ggtitle("Sepal Length") +
  tema6

sepalwidth <- ggplot(
  data = dados,
  mapping = aes(x = SepalWidthCm, y = Species, fill = Species)
) +
  geom_boxplot(linewidth = 1.3) +
  stat_boxplot(geom = "errorbar") +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  theme_fivethirtyeight() +
  ggtitle("Sepal Width") +
  tema6

petallength <- ggplot(
  data = dados,
  mapping = aes(x = PetalLengthCm, y = Species, fill = Species)
) +
  geom_boxplot(linewidth = 1.3) +
  stat_boxplot(geom = "errorbar") +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  theme_fivethirtyeight() +
  ggtitle("Petal Length") +
  tema6

petalwidth <- ggplot(
  data = dados,
  mapping = aes(x = PetalWidthCm, y = Species, fill = Species)
) +
  geom_boxplot(linewidth = 1.3) +
  stat_boxplot(geom = "errorbar") +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  theme_fivethirtyeight() +
  ggtitle("Petal Width") +
  tema6

# Arrange the four boxplots in a 2 x 2 grid.
plot_grid(sepallength, sepalwidth, petallength, petalwidth, ncol = 2, nrow = 2)

3.6 Scatter Plots

# Theme tweaks layered on top of theme_economist() for the two scatter
# panels: centred title, nudged axis titles, bold legend at the bottom.
tema7 <- theme(
  plot.title = element_text(size = 15, hjust = 0.5, vjust = 1),
  axis.title.y = element_text(size = 12, vjust = 2),
  axis.title.x = element_text(size = 12, vjust = -1),
  axis.text.x = element_text(size = 10),
  axis.text.y = element_text(size = 10),
  legend.position = "bottom",
  legend.text = element_text(colour = "black", size = 10, face = "bold")
)

# Requested plot size (repr.plot.* options) for the side-by-side panels.
options(repr.plot.width = 17, repr.plot.height = 7)

# Sepal length vs. sepal width; species is encoded by both colour and shape.
a <- ggplot(dados, aes(x = SepalLengthCm, y = SepalWidthCm)) +
  geom_point(aes(color = Species, shape = Species), size = 5) +
  theme_economist() +
  tema7

# Petal length vs. petal width, with the same species encoding.
b <- ggplot(dados, aes(x = PetalLengthCm, y = PetalWidthCm)) +
  geom_point(aes(color = Species, shape = Species), size = 5) +
  theme_economist() +
  tema7

# Show both scatter plots next to each other.
plot_grid(a, b, ncol = 2, nrow = 1)

# Theme tweaks for the faceted sepal scatter plot: large facet strip labels
# and a larger bold legend at the bottom.
tema8 <- theme(
  plot.title = element_text(size = 15, hjust = 0.5, vjust = 1),
  axis.title.y = element_text(size = 12, vjust = 2),
  axis.title.x = element_text(size = 12, vjust = -1),
  axis.text.x = element_text(size = 10),
  axis.text.y = element_text(size = 10),
  legend.position = "bottom",
  strip.text.x = element_text(size = 22, color = "black"),
  legend.text = element_text(colour = "black", size = 15, face = "bold")
)

# Requested plot size (repr.plot.* options) for the three-facet row.
options(repr.plot.width = 14, repr.plot.height = 7)

# Sepal length vs. sepal width, one facet per species laid out in one row.
ggplot(
  dados,
  aes(x = SepalLengthCm, y = SepalWidthCm, color = Species, shape = Species)
) +
  geom_point(size = 4.5) +
  facet_wrap(vars(Species), ncol = 3) +
  theme_economist() +
  tema8

# Theme tweaks for the faceted petal scatter plot: large facet strip labels
# and a larger bold legend at the bottom.
tema9 <- theme(
  plot.title = element_text(size = 15, hjust = 0.5, vjust = 1),
  axis.title.y = element_text(size = 12, vjust = 2),
  axis.title.x = element_text(size = 12, vjust = -1),
  axis.text.x = element_text(size = 10),
  axis.text.y = element_text(size = 10),
  legend.position = "bottom",
  strip.text.x = element_text(size = 22, color = "black"),
  legend.text = element_text(colour = "black", size = 15, face = "bold")
)

# Requested plot size (repr.plot.* options) for the three-facet row.
options(repr.plot.width = 14, repr.plot.height = 7)

# Petal length vs. petal width, one facet per species laid out in one row.
ggplot(
  dados,
  aes(x = PetalLengthCm, y = PetalWidthCm, color = Species, shape = Species)
) +
  geom_point(size = 4.5) +
  facet_wrap(vars(Species), ncol = 3) +
  theme_economist() +
  tema9

3.7 Correlation Graphs

# Keep the four measurements plus the species label (the Id column is
# dropped); Species stays in `df` for plots that colour by species.
df <- select(
  dados,
  SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm, Species
)

# Correlation matrix of the measurements. Species is character data, which
# ggcorr() cannot use and would skip with a warning, so it is excluded
# explicitly here.
ggcorr(select(df, -Species))

# Theme tweaks layered on top of theme_economist() for the pairs plot:
# no legend, 15pt strip labels on both axes.
tema10 <- theme(
  plot.title = element_text(size = 15, hjust = 0.5, vjust = 1),
  axis.title.y = element_text(size = 10, vjust = 2),
  axis.title.x = element_text(size = 5, vjust = -1),
  axis.text.x = element_text(size = 10),
  axis.text.y = element_text(size = 10),
  legend.position = "none",
  strip.text.x = element_text(size = 15, color = "black"),
  strip.text.y = element_text(size = 15, color = "black"),
  legend.text = element_text(colour = "black", size = 12, face = "bold")
)

# Requested plot size (repr.plot.* options) for the pairs matrix.
options(repr.plot.width = 10, repr.plot.height = 9)

# Pairwise plots of the measurements, coloured by species. In `df` the four
# measurements occupy columns 1 to 4 and Species is column 5, so `1:4`
# includes SepalLengthCm (a `2:4` range would silently leave it out).
ggpairs(df, mapping = ggplot2::aes(colour = Species), columns = 1:4) +
  theme_economist() +
  tema10

Reference

  1. The exercise displayed on this page is based on the work by MURILÃO on Kaggle.