# Copyright 2010-2021 Meik Michalke # # This file is part of the R package koRpus. # # koRpus is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # koRpus is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with koRpus. If not, see . # internal function to produce the word class distribution table # wclass: object@tokens[["wclass"]] # lang: object@lang # abs: if not NULL, percentages will also be calculated relative to this number wClassNoPunct <- function(wclass, lang, abs=NULL){ word.tags <- kRp.POS.tags(lang, list.classes=TRUE, tags="words") wclass.num <- summary(as.factor(wclass)) wclass.nopunct <- names(wclass.num)[names(wclass.num) %in% word.tags] wclass.punct <- names(wclass.num)[!names(wclass.num) %in% word.tags] wclass.nopunct.num <- wclass.num[wclass.nopunct] wclass.punct.num <- wclass.num[wclass.punct] wclass.nopunct.num <- wclass.nopunct.num[order(wclass.nopunct.num, decreasing=TRUE)] if(is.null(abs)){ wclass.nopunct.num <- rbind(wclass.nopunct.num, 100 * wclass.nopunct.num / sum(wclass.nopunct.num)) rownames(wclass.nopunct.num) <- c("num", "pct") } else { wclass.nopunct.num <- rbind(wclass.nopunct.num, 100 * wclass.nopunct.num / sum(wclass.nopunct.num), 100 * wclass.nopunct.num / abs) rownames(wclass.nopunct.num) <- c("num", "pct", "pct.abs") } wclass.nopunct.num <- t(wclass.nopunct.num) if(length(wclass.punct) != 0){ if(ncol(wclass.nopunct.num) > 2){ wclass.nopunct.num <- rbind(wclass.nopunct.num, cbind(wclass.punct.num, NA, NA)) } else { wclass.nopunct.num <- rbind(wclass.nopunct.num, cbind(wclass.punct.num, NA)) } } else {} return(wclass.nopunct.num) } #' @param index Either a vector indicating which rows should be considered as transformed for the statistics, #' or the name of a particular transformation that was previously done to the object, if more than one transformation was applied. #' If \code{NA}, all rows where \code{"equal"} is \code{FALSE} are used. #' Only valid for objects providing a \code{diff} feature. #' @param feature A character string naming a feature present in the object, to trigger a summary regarding that feature. #' Currently only \code{"freq"}, \code{"lex_div"}, and \code{"readability"} are implemented. #' @param flat Logical, if \code{TRUE} and \code{feature="lex_div"} or \code{"readability"}, a named vector of main #' results is returned. For objects containig more than one \code{doc_id}, defaults to \code{TRUE} automatically and #' returns a data frame with named rows. #' @include 01_class_01_kRp.text.R #' @include 02_method_summary.kRp.lang.R #' @export #' @docType methods #' @rdname summary-methods #' @aliases summary,kRp.text-method #' @example inst/examples/if_lang_en_clause_start.R #' @example inst/examples/define_sample_file.R #' @examples #' tokenized.obj <- tokenize( #' txt=sample_file, #' lang="en" #' ) #' # this will look more useful when you #' # can use treetag() instead of tokenize() #' summary(tokenized.obj) #' @example inst/examples/if_lang_en_clause_end.R setMethod("summary", signature(object="kRp.text"), function(object, index=NA, feature=NULL, flat=FALSE){ if(identical(feature, "freq")){ stopifnot(hasFeature(object, "freq")) summary.table <- t(data.frame( sentences=describe(object)[["sentences"]], avg.sentence.length=describe(object)[["avg.sentc.length"]], words=describe(object)[["words"]], avg.word.length=describe(object)[["avg.word.length"]], all.characters=describe(object)[["all.chars"]], letters=describe(object)[["letters"]][["all"]], lemmata=describe(object)[["lemmata"]], questions=describe(object)[["questions"]], exclamations=describe(object)[["exclam"]], semicolon=describe(object)[["semicolon"]], colon=describe(object)[["colon"]], stringsAsFactors=FALSE)) colnames(summary.table) <- "freq" return(summary.table) } else if(identical(feature, "lex_div")){ stopifnot(hasFeature(object, "lex_div")) object_lexdiv <- corpusLexDiv(object) if(length(object_lexdiv) > 1){ summary.table <- t(as.data.frame( lapply( object_lexdiv, summary, flat=TRUE ) )) } else { summary.table <- summary(object_lexdiv[[1]], flat=flat) } return(summary.table) } else if(identical(feature, "readability")){ stopifnot(hasFeature(object, "readability")) object_rdb <- corpusReadability(object) if(length(object_rdb) > 1){ summary.table <- t(as.data.frame( lapply( object_rdb, summary, flat=TRUE ) )) } else { summary.table <- summary(object_rdb[[1]], flat=flat) } return(summary.table) } else { # to prevent hiccups from R CMD check Row.names <- NULL desc <- describe(object) lang <- language(object) tokens <- taggedText(object) wclass.nopunct.num <- wClassNoPunct(wclass=tokens[["wclass"]], lang=lang) if(hasFeature(object, "diff")){ wclass.orig.order <- order(order(rownames(wclass.nopunct.num))) if(isTRUE(is.na(index))){ wclass.index <- !tokens[["equal"]] } else if(is.character(index)){ if(length(index) > 1){ stop(simpleError(paste0("If \"index\" is character, it must be a single value!"))) } else {} diffObj <- diffText(object) if(index %in% colnames(diffObj[["transfmt.equal"]])){ indexPosition <- which(colnames(diffObj[["transfmt.equal"]]) %in% index) if(length(indexPosition) > 1){ warning(paste0("Index \"", index,"\" found multiple times, using last occurrence only!"), call.=FALSE) indexPosition <- max(indexPosition) } else {} } else { stop(simpleError(paste0("Transformation data \"", index,"\" not found in object!"))) } wclass.index <- !diffObj[["transfmt.equal"]][[indexPosition]] } else { wclass.index <- index } wclass.nopunct.num.transfmt <- wClassNoPunct(wclass=tokens[wclass.index,"wclass"], lang=lang, abs=desc[["words"]]) colnames(wclass.nopunct.num.transfmt) <- c("num.transfmt", "pct.transfmt", "pct.transfmt.abs") wclass.nopunct.num <- merge(wclass.nopunct.num, wclass.nopunct.num.transfmt, all=TRUE, by='row.names', sort=FALSE, suffixes=c("", ".transfmt")) # merge adds a column for row numbers, reverse that rownames(wclass.nopunct.num) <- wclass.nopunct.num[["Row.names"]] wclass.nopunct.num <- subset(wclass.nopunct.num, select=-Row.names) # regain original order wclass.nopunct.num <- wclass.nopunct.num[order(rownames(wclass.nopunct.num))[wclass.orig.order],] # add another column for the percentage of words of each class which were removed wclass.nopunct.num[["pct.transfmt.wclass"]] <- wclass.nopunct.num[["num.transfmt"]] * 100 / wclass.nopunct.num[["num"]] # correct for possible division by zero, NaN looks confusing here wclass.nopunct.num[is.nan(wclass.nopunct.num[["pct.transfmt.wclass"]]), "pct.transfmt.wclass"] <- 0 } else {} cat( "\n Sentences: ", desc[["sentences"]], "\n", " Words: ", desc[["words"]], " (", round(desc[["avg.sentc.length"]], digits=2), " per sentence)\n", " Letters: ", desc[["letters"]][["all"]], " (", round(desc[["avg.word.length"]], digits=2), " per word)\n\n Word class distribution:\n\n", sep="") return(wclass.nopunct.num) } })