##################################################################################### # R-scrip for workshop on “Application of Machine Learning in Agriculture # # and Livestock Production” Presented at “Application of statistical procedure # # in biological data” series by Saleh Shahinfar on Dec 10th 2021. # # shahinfar@uwalumni.com # ##################################################################################### rm(list=ls()) options(warn=-1) setwd("\\Desktop/ML_Course") #calculating model performance metrics MPM <- function(predicted, actual) { CM <- table(as.factor(predicted), as.factor(actual)) storage.mode(CM) <- "numeric" # this is to avoid integer overflow error n <- sum(CM) diag <- diag(CM) FP <- rowSums(CM) - diag(CM) FN <- colSums(CM) - diag(CM) TP <- diag(CM) TN <- sum(CM) - (TP + FP + FN) TPR <- round(TP/(TP + FN), 2) # true positive rate, recall, Sensitivity, hit rate TNR <- round(TN/(TN + FP), 2) # true negative rate or Specificity PRE <- round(TP/(TP + FP), 2) # Precision or positive predictive value FPR <- round(FP/(FP + TN), 2) # false positive rate pr Fall out rate ACC <- round((TP + TN)/(TP + TN + FP + FN), 2) # OVERALL ACCURACY, correctly classified Instances F1 <- round(((2 * TP)/((2 * TP) + FP + FN)), 2) return(list(ACC = ACC, Precision = PRE, TPR = TPR, TNR = TNR, FPR = FPR, F1 = F1)) } MPM_2 <- function(predicted, actual) { yHat <- as.numeric(predicted) y <- as.numeric(actual) if (sum(yHat) == length(yHat)) { yHat[1] <- 0 } else if (sum(yHat) == 0) { yHat[1] <- 1 } else if (sum(y) == length(y)) { yHat[1] <- 0 } else if (sum(y) == 0) { yHat[1] <- 1 } CM <- table(as.factor(yHat), as.factor(y)) storage.mode(CM) <- "numeric" # this is to avoid integer overflow error n <- sum(CM) diag <- diag(CM) TP <- CM[2, 2] TN <- CM[1, 1] FP <- CM[2, 1] FN <- CM[1, 2] ACC <- round((sum(diag)/n), 2) Precision <- round( TP/(TP + FP), 2) TPR <- round( TP/(TP + FN), 2) TNR <- round( TN/(TN + FP), 2) FPR <- round( FP/(FP + TN), 2) F1 <- round(((2 * TP)/((2 * TP) + FP + FN)), 2) return(list(ACC = ACC, Precision = Precision, TPR = TPR, TNR = TNR, FPR = FPR, F1 = F1)) } normalize <- function(x) { return((x - min(x))/(max(x) - min(x))) } ############################################# #$ Naive Bayes $# ############################################# #Measurements of geometrical properties of kernels belonging to three different #varieties of wheat. #A soft X-ray technique and GRAINS package were used to construct all seven, #real-valued attributes. #https://archive.ics.uci.edu/ml/datasets/seeds rm(list=ls()) Wheat <- read.csv("./Seed_data/seeds_dataset.csv", sep = ",", header = TRUE) # making the train and the test data set Wheat<-Wheat[sample(nrow(Wheat)),] W_tr <- sample(dim(Wheat)[1], 150) d_tr <- Wheat[W_tr, ] d_ts <- Wheat[-W_tr, ] str(Wheat) library(e1071) NB <- naiveBayes(Wheat_type ~ ., data = d_tr) NB NB_yHat <- predict(NB, newdata = d_ts[, 1:7]) table(NB_yHat, d_ts[, 8]) MPM(NB_yHat, d_ts[, 8]) NBProb <- predict(NB, newdata = d_ts[, 1:7], type = "raw") head(NBProb) ############################################# #$ Decision trees $# ############################################# # here we want to predict incidence of lameness in dairy cows by Decision Tree algorithm # using various production and linear and composite type traits. # this is a prediction at cow level. rm(list=ls()) Lameness_data <- read.csv("./Lameness_data/lame.csv", sep = ",", header = TRUE) Lameness_data<- cbind(Lameness_data[,1:2], sapply(Lameness_data[,3:30], normalize)) str(Lameness_data) # making the train and the test data set set.seed(115) tr <- sample(dim(Lameness_data)[1], 410) d_tr <- Lameness_data[tr, ] d_tr$Lameness2 <- as.factor(d_tr$Lameness2) d_ts <- Lameness_data[-tr, ] library(tree) lame <- tree(Lameness2 ~ ., data = d_tr) summary(lame) plot(lame) text(lame, cex = 0.65, pretty = FALSE) tsize <- summary(lame)$size #to store size of unpruned tree cv_lame <- cv.tree(lame, K = 3) plot(cv_lame$size, cv_lame$dev, type = "b") pruned_lame <- prune.tree(lame, best = 6) plot(pruned_lame) text(pruned_lame, pretty = 0) p_tree_yHat <- predict(pruned_lame, newdata = d_ts, type = "class") table(p_tree_yHat, d_ts[, 1]) MPM_2(p_tree_yHat, d_ts[, 1]) ############################################# #$ Random Forest $# ############################################# # measurements of NIR on 160 wavelength on different Wheat line and their measured # hagberg falling number (which is a proxy for Late MAturity Alpha Amylase Activity) is included in this dataset. # Using RandomForest algorithm we want to predict Falling number for each wheat line. rm(list=ls()) NIR<-read.csv('./Grain_NIR/NIR.csv', header=T) # making the train and the test data set NIR <- NIR[sample(nrow(NIR)), ] d_NIR <- sample(dim(NIR)[1], 1200) NIR_tr <- NIR[d_NIR[1:1000], ] NIR_ts <- NIR[d_NIR[1001:1200], ] library(randomForest) metrics <- NULL for (l in seq(5, (ncol(NIR_tr)/2), by = 10)) { for (n in seq(40, 120, by = 20)) { RF <- randomForest(FN ~ ., data = NIR_tr, ntree = n, mtry = l, importance = FALSE) RF_yHat <- predict(RF, newdata = NIR_ts) CC_RF <- round(cor(RF_yHat, NIR_ts$FN), 3) MAE_RF <- round(mean(abs(RF_yHat - NIR_ts$FN)), 3) cat(l, " ", n, " ", CC_RF, " ", MAE_RF, " ", "\n") metrics <- rbind(metrics, cbind(l, n, CC_RF, MAE_RF)) } } metrics <- data.frame(metrics) library(reshape) h <- data.matrix(cast(metrics[, 1:3], l ~ n)) h_cc <- h[, 2:ncol(h)] row.names(h_cc) <- h[, 1] heatmap(h_cc, Rowv = NA, Colv = NA, scale = "none", col = hcl.colors(50), margins = c(5, 10)) #h <- data.matrix(cast(metrics[, c(1, 2, 4)], l ~ n)) #h_mae <- h[, 2:ncol(h)] #row.names(h_mae) <- h[, 1] #heatmap(h_mae, Rowv = NA, Colv = NA, scale = "none", margins = c(5, 10), cellnote = h_mae) ############################################# #$ Artificial Neural Networks $# ############################################# #The data from an Australian Merino Sheep flock. #Each row contains information from each yearling merino lamb. #The last column is wool production of the same sheep when they are adult (our trait of interest for prediction). # we want to Train a neural network to predict adult wool production using all of the attributes available set.seed(330) Fleece <- read.csv("./Sheep_Wool/FleeceWeight.csv", header = TRUE, stringsAsFactors = F) str(Fleece) normalize <- function(x) { return((x - min(x))/(max(x) - min(x))) } Fleece_norm <- as.data.frame(lapply(Fleece, normalize)) # check for normalized sample summary(Fleece_norm$AdultWool) # compare with the original summary(Fleece$AdultWool) # making the train and the test data set train <- sample(4000, 3000) Fleece_train <- Fleece_norm[train, ] Fleece_test <- Fleece_norm[-train, 1:15] library(neuralnet) ANN_1 <- neuralnet(AdultWool ~ SEX + BT + SPINFINE + yWool + yearlingAGE + AdultAge + Dag + CCov + BDWR + Pregnancy + BCS + Weight + DryWt + Rain + Temp, data = Fleece_train, hidden = 1) plot(ANN_1) modelResults <- compute(ANN_1, Fleece_test) # to validate our model on testing set Predicted_Fleece <- modelResults$net.result cor(Fleece_norm[-train, 16], Predicted_Fleece) mae <- (sum(abs(Fleece_norm[-train, 16] - Predicted_Fleece))/dim(Fleece_test)[1]) mae plot(Fleece_norm[-train, 16], Predicted_Fleece, xlim = c(0, 1), ylim = c(0, 1), abline(coef = c(0, 1))) # fitting a Neural Network with Multiple hidden nodes cc <- vector(mode = "numeric", length = 7) for (i in 1:7) { ANN <- neuralnet(AdultWool ~ . , data = Fleece_train, hidden = i, threshold = 0.01, stepmax = 1e+07) modelResults <- compute(ANN, Fleece_test) Predicted_Fleece <- modelResults$net.result cc[i] <- cor(Fleece_norm[-train, 16], Predicted_Fleece) } # draw a plot plot(c(1:7), cc[1:7], xlab = "Number of Hidden Neurons", ylab = " Correlation", pch = 19) lines(c(1:7), cc[1:7], type = "l") ########### ANN with 2 hidden layers ANN_5_3 <- neuralnet(AdultWool ~ . , data = Fleece_train, hidden = c(5, 3), stepmax = 1e+07) plot(ANN_5_3) modelResults <- compute(ANN_5_3, Fleece_test) Predicted_Fleece <- modelResults$net.result cor(Fleece_norm[-train, 16], Predicted_Fleece) mae <- (sum(abs(Fleece_norm[-train, 16] - Predicted_Fleece))/dim(Fleece_test)[1]) mae plot(Fleece_norm[-train, 16], Predicted_Fleece, xlim = c(0, 1), ylim = c(0, 1), abline(coef = c(0, 1))) ##################################################################################### # R-scrip for workshop on “Application of Machine Learning in Agriculture # # and Livestock Production” Presented at “Application of statistical procedure # # in biological data” series by Saleh Shahinfar on Dec 10th 2021. # # shahinfar@uwalumni.com # #####################################################################################