# Copyright 2010-2021 Meik Michalke # # This file is part of the R package koRpus. # # koRpus is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # koRpus is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with koRpus. If not, see . #' Getter/setter methods for koRpus objects #' #' These methods should be used to get or set values of tagged text objects #' generated by koRpus functions like \code{\link[koRpus:treetag]{treetag}} or \code{\link[koRpus:tokenize]{tokenize}}. #' #' \itemize{ #' \item{\code{taggedText()} }{returns the \code{tokens} slot.} #' \item{\code{doc_id()} }{Returns a character vector of all \code{doc_id} values in the object.} #' \item{\code{describe()} }{returns the \code{desc} slot.} #' \item{\code{language()} }{returns the \code{lang} slot.} #' \item{\code{[}/\code{[[} }{Can be used as a shortcut to index the results of \code{taggedText()}.} #' \item{\code{fixObject} }{returns the same object upgraded to the object structure of this package version (e.g., new columns, changed names, etc.).} #' \item{\code{hasFeature()} }{returns \code{TRUE} or code{FALSE}, depending on whether the requested feature is present or not.} #' \item{\code{feature()} }{returns the list entry of the \code{feat_list} slot for the requested feature.} #' \item{\code{corpusReadability()} }{returns the list of \code{kRp.readability} objects, see \code{\link[koRpus:readability]{readability}}.} #' \item{\code{corpusHyphen()} }{returns the list of \code{kRp.hyphen} objects, see \code{\link[koRpus:hyphen]{hyphen}}.} #' \item{\code{corpusLexDiv()} }{returns the list of \code{kRp.TTR} objects, see \code{\link[koRpus:lex.div]{lex.div}}.} #' \item{\code{corpusFreq()} }{returns the frequency analysis data from the \code{feat_list} slot, see \code{\link[koRpus:freq.analysis]{freq.analysis}}.} #' \item{\code{corpusCorpFreq()} }{returns the \code{kRp.corp.freq} object of the \code{feat_list} slot, see for example \code{\link[koRpus:read.corp.custom]{read.corp.custom}}.} #' \item{\code{corpusStopwords()} }{returns the number of stopwords found in each text (if analyzed) from the \code{feat_list} slot.} #' \item{\code{tif_as_tokens_df} }{returns the \code{tokens} slot in a TIF[1] compliant format, i.e., \code{doc_id} is not a factor but a character vector.} #' \item{\code{originalText()} }{similar to \code{taggedText()}, but reverts any transformations back to the original text before returning the \code{tokens} slot. #' Only works if the object has the feature \code{diff}, see examples.} #' \item{\code{diffText()} }{returns the \code{diff} slot, if present.} #' } #' #' @param add.desc Logical, determines whether the \code{desc} column should be re-written with descriptions #' for all POS tags. #' @param doc_id Logical (except for \code{fixObject}, \code{feature}, and \code{[[/[[<-}), if \code{TRUE} the \code{doc_id} column will be a factor with the respective value #' of the \code{desc} slot, i.\,e., the document ID will be preserved in the data.frame. If used with \code{fixObject}, can be a character string #' to set the document ID manually (the default \code{NA} will preserve existing values and not overwrite them). If used with \code{feature} or \code{[[/[[<-}, #' a character vector to limit the scope to one or more particular document IDs. #' @param ... Additional arguments for the generics. #' @rdname kRp.text_get-methods #' @docType methods #' @export #' @references #' [1] Text Interchange Formats (\url{https://github.com/ropensci/tif}) #' @example inst/examples/if_lang_en_clause_start.R #' @example inst/examples/define_sample_file.R #' @examples #' tokenized.obj <- tokenize( #' txt=sample_file, #' lang="en" #' ) #' #' doc_id(tokenized.obj) #' #' describe(tokenized.obj) #' #' language(tokenized.obj) #' #' taggedText(tokenized.obj) #' tokenized.obj[["token"]] #' tokenized.obj[1:3, "token"] #' #' tif_as_tokens_df(tokenized.obj) #' #' # example for originalText() #' tokenized.obj <- jumbleWords(tokenized.obj) #' # now compare the jumbled words to the original #' tokenized.obj[["token"]] #' originalText(tokenized.obj)[["token"]] #' @example inst/examples/if_lang_en_clause_end.R setGeneric("taggedText", function(obj, add.desc=FALSE, doc_id=FALSE) standardGeneric("taggedText")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' taggedText,-methods #' taggedText,kRp.text-method #' @include koRpus-internal.R setMethod("taggedText", signature=signature(obj="kRp.text"), function (obj, add.desc=FALSE, doc_id=FALSE){ result <- slot(obj, name="tokens") if(isTRUE(add.desc)){ result[["desc"]] <- explain_tags( tags=result[["tag"]], lang=language(obj), cols="desc" ) } else {} if(isTRUE(doc_id)){ result[["doc_id"]] <- as.factor(describe(obj)[["doc_id"]]) } else {} return(result) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @param value The new value to replace the current with. setGeneric("taggedText<-", function(obj, value) standardGeneric("taggedText<-")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' taggedText<-,-methods #' taggedText<-,kRp.text-method setMethod("taggedText<-", signature=signature(obj="kRp.text"), function (obj, value){ slot(obj, name="tokens") <- value return(obj) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods setGeneric("doc_id", function(obj, ...) standardGeneric("doc_id")) #' @rdname kRp.text_get-methods #' @param has_id A character vector with \code{doc_id}s to look for in the object. The return value #' is then a logical vector of the same length, indicating if the respective id was found or not. #' @export #' @docType methods #' @aliases #' doc_id,-methods #' doc_id,kRp.text-method setMethod("doc_id", signature=signature(obj="kRp.text"), function (obj, has_id=NULL){ result <- unique(as.character(slot(obj, name="tokens")[["doc_id"]])) if(is.null(has_id)){ return(result) } else if(length(has_id) > 1) { return(has_id %in% result) } else { return(any(result == has_id)) } } ) #' @rdname kRp.text_get-methods #' @docType methods #' @param feature Character string naming the feature to look for. The return value is logical if a single feature #' name is given. If \code{feature=NULL}, a character vector is returned, naming all features found in the object. #' @export setGeneric("hasFeature", function(obj, feature=NULL, ...) standardGeneric("hasFeature")) #' @rdname kRp.text_get-methods #' @docType methods #' @export #' @aliases #' hasFeature,-methods #' hasFeature,kRp.text-method setMethod("hasFeature", signature=signature(obj="kRp.text"), function (obj, feature=NULL){ if(is.null(feature)){ features <- slot(obj, "features") return(names(features[features])) } else { return(isTRUE(slot(obj, name="features")[feature])) } } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods setGeneric("hasFeature<-", function(obj, feature, value) standardGeneric("hasFeature<-")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' hasFeature<-,-methods #' hasFeature<-,kRp.text-method setMethod("hasFeature<-", signature=signature(obj="kRp.text"), function (obj, feature, value){ if(!is.logical(value)){ stop(simpleError("The \"feature\" value must be logical!")) } else {} if(isTRUE(value)){ slot(obj, name="features")[feature] <- value } else { current_features <- slot(obj, name="features") slot(obj, name="features") <- current_features[!names(current_features) %in% feature] } return(obj) } ) #' @rdname kRp.text_get-methods #' @docType methods #' @export setGeneric("feature", function(obj, feature, ...) standardGeneric("feature")) #' @rdname kRp.text_get-methods #' @docType methods #' @export #' @aliases #' feature,-methods #' feature,kRp.text-method setMethod("feature", signature=signature(obj="kRp.text"), function (obj, feature, doc_id=NULL){ if(is.null(doc_id)){ return(slot(obj, name="feat_list")[[feature]]) } else { doc_ids_in_obj <- doc_id(obj, has_id=doc_id) if(all(doc_ids_in_obj)){ return(slot(obj, name="feat_list")[[feature]][doc_id]) } else { warning( paste0("Invalid doc_id, omitted:\n \"", paste0(doc_id[!doc_ids_in_obj], collapse="\", \""), "\""), call.=FALSE ) return(slot(obj, name="feat_list")[[feature]][doc_id[doc_ids_in_obj]]) } } } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods # @param value The new value to replace the current with. setGeneric("feature<-", function(obj, feature, value) standardGeneric("feature<-")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' feature<-,-methods #' feature<-,kRp.text-method setMethod("feature<-", signature=signature(obj="kRp.text"), function (obj, feature, value){ slot(obj, name="feat_list")[[feature]] <- value if(is.null(value)){ hasFeature(obj, feature) <- FALSE } else { hasFeature(obj, feature) <- TRUE } return(obj) } ) #' @rdname kRp.text_get-methods #' @docType methods #' @export setGeneric("corpusReadability", function(obj, ...) standardGeneric("corpusReadability")) #' @rdname kRp.text_get-methods #' @docType methods #' @export #' @aliases #' corpusReadability,-methods #' corpusReadability,kRp.text-method setMethod("corpusReadability", signature=signature(obj="kRp.text"), function (obj, doc_id=NULL){ return(feature(obj, "readability", doc_id=doc_id)) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods # @param value The new value to replace the current with. setGeneric("corpusReadability<-", function(obj, value) standardGeneric("corpusReadability<-")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' corpusReadability<-,-methods #' corpusReadability<-,kRp.text-method setMethod("corpusReadability<-", signature=signature(obj="kRp.text"), function (obj, value){ feature(obj, "readability") <- value return(obj) } ) #' @rdname kRp.text_get-methods #' @docType methods #' @export setGeneric("corpusHyphen", function(obj, ...) standardGeneric("corpusHyphen")) #' @rdname kRp.text_get-methods #' @docType methods #' @export #' @aliases #' corpusHyphen,-methods #' corpusHyphen,kRp.text-method setMethod("corpusHyphen", signature=signature(obj="kRp.text"), function (obj, doc_id=NULL){ return(feature(obj, "hyphen", doc_id=doc_id)) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods # @param value The new value to replace the current with. setGeneric("corpusHyphen<-", function(obj, value) standardGeneric("corpusHyphen<-")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' corpusHyphen<-,-methods #' corpusHyphen<-,kRp.text-method setMethod("corpusHyphen<-", signature=signature(obj="kRp.text"), function (obj, value){ feature(obj, "hyphen") <- value return(obj) } ) #' @rdname kRp.text_get-methods #' @docType methods #' @export setGeneric("corpusLexDiv", function(obj, ...) standardGeneric("corpusLexDiv")) #' @rdname kRp.text_get-methods #' @docType methods #' @export #' @aliases #' corpusLexDiv,-methods #' corpusLexDiv,kRp.text-method setMethod("corpusLexDiv", signature=signature(obj="kRp.text"), function (obj, doc_id=NULL){ return(feature(obj, "lex_div", doc_id=doc_id)) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods # @param value The new value to replace the current with. setGeneric("corpusLexDiv<-", function(obj, value) standardGeneric("corpusLexDiv<-")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' corpusLexDiv<-,-methods #' corpusLexDiv<-,kRp.text-method setMethod("corpusLexDiv<-", signature=signature(obj="kRp.text"), function (obj, value){ feature(obj, "lex_div") <- value return(obj) } ) #' @rdname kRp.text_get-methods #' @docType methods #' @export setGeneric("corpusFreq", function(obj, ...) standardGeneric("corpusFreq")) #' @rdname kRp.text_get-methods #' @docType methods #' @export #' @aliases #' corpusFreq,-methods #' corpusFreq,kRp.text-method setMethod("corpusFreq", signature=signature(obj="kRp.text"), function (obj){ return(feature(obj, "freq")) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods # @param value The new value to replace the current with. setGeneric("corpusFreq<-", function(obj, value) standardGeneric("corpusFreq<-")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' corpusFreq<-,-methods #' corpusFreq<-,kRp.text-method setMethod("corpusFreq<-", signature=signature(obj="kRp.text"), function (obj, value){ feature(obj, "freq") <- value return(obj) } ) #' @rdname kRp.text_get-methods #' @docType methods #' @export setGeneric("corpusCorpFreq", function(obj, ...) standardGeneric("corpusCorpFreq")) #' @rdname kRp.text_get-methods #' @docType methods #' @export #' @aliases #' corpusCorpFreq,-methods #' corpusCorpFreq,kRp.text-method setMethod("corpusCorpFreq", signature=signature(obj="kRp.text"), function (obj){ return(feature(obj, "corp_freq")) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods # @param value The new value to replace the current with. setGeneric("corpusCorpFreq<-", function(obj, value) standardGeneric("corpusCorpFreq<-")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' corpusCorpFreq<-,-methods #' corpusCorpFreq<-,kRp.text-method setMethod("corpusCorpFreq<-", signature=signature(obj="kRp.text"), function (obj, value){ feature(obj, "corp_freq") <- value return(obj) } ) #' @rdname kRp.text_get-methods #' @docType methods #' @export setGeneric("corpusStopwords", function(obj, ...) standardGeneric("corpusStopwords")) #' @rdname kRp.text_get-methods #' @docType methods #' @export #' @aliases #' corpusStopwords,-methods #' corpusStopwords,kRp.text-method setMethod("corpusStopwords", signature=signature(obj="kRp.text"), function (obj){ return(feature(obj, "stopwords")) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods # @param value The new value to replace the current with. setGeneric("corpusStopwords<-", function(obj, value) standardGeneric("corpusStopwords<-")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' corpusStopwords<-,-methods #' corpusStopwords<-,kRp.text-method setMethod("corpusStopwords<-", signature=signature(obj="kRp.text"), function (obj, value){ feature(obj, "stopwords") <- value return(obj) } ) #' @rdname kRp.text_get-methods #' @param x An object of class \code{kRp.text} or \code{kRp.hyphen}. #' @param i Defines the row selector (\code{[}) or the name to match (\code{[[}). #' @param j Defines the column selector. #' @param drop Logical, whether the result should be coerced to the lowest possible dimension. See \code{\link[base:Extract]{[}} for more details. #' @export #' @docType methods #' @aliases #' [,-methods #' [,kRp.text,ANY,ANY-method setMethod("[", signature=signature(x="kRp.text"), function (x, i, j, ...){ return(taggedText(x)[i, j, ...]) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' [<-,-methods #' [<-,kRp.text,ANY,ANY,ANY-method setMethod("[<-", signature=signature(x="kRp.text"), function (x, i, j, ..., value){ taggedText(x)[i, j, ...] <- value return(x) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' [[,-methods #' [[,kRp.text,ANY-method setMethod("[[", signature=signature(x="kRp.text"), function (x, i, doc_id=NULL, ...){ if(is.null(doc_id)){ return(taggedText(x)[[i, ...]]) } else { doc_ids_in_obj <- doc_id(x, has_id=doc_id) tt <- taggedText(x) if(all(doc_ids_in_obj)){ return(tt[tt[["doc_id"]] %in% doc_id, i, ...]) } else { warning( paste0("Invalid doc_id, omitted:\n \"", paste0(doc_id[!doc_ids_in_obj], collapse="\", \""), "\""), call.=FALSE ) return(tt[tt[["doc_id"]] %in% doc_id[doc_ids_in_obj], i, ...]) } } } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' [[<-,-methods #' [[<-,kRp.text,ANY,ANY-method setMethod("[[<-", signature=signature(x="kRp.text"), function (x, i, doc_id=NULL, ..., value){ if(is.null(doc_id)){ taggedText(x)[[i]] <- value } else { doc_ids_in_obj <- doc_id(x, has_id=doc_id) tt <- taggedText(x) if(all(doc_ids_in_obj)){ tt[tt[["doc_id"]] %in% doc_id, i, ...] <- value taggedText(x) <- tt } else { stop(simpleError( paste0("Invalid doc_id:\n \"", paste0(doc_id[!doc_ids_in_obj], collapse="\", \""), "\"!") )) } } return(x) } ) ## the standard generic for describe() is defined in the sylly package #' @importFrom sylly describe #' @param simplify Logical, if \code{TRUE} and the result is a list oft length one (i.e., just a single \code{doc_id}), #' returns the contents of the single list entry. #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' describe,-methods #' describe,kRp.text-method setMethod("describe", signature=signature(obj="kRp.text"), function (obj, doc_id=NULL, simplify=TRUE, ...){ result <- slot(obj, name="desc") if(!is.null(doc_id)){ doc_ids_in_obj <- doc_id(obj, has_id=doc_id) if(all(doc_ids_in_obj)){ result <- result[names(result) %in% doc_id] } else { stop(simpleError( paste0("Invalid doc_id:\n \"", paste0(doc_id[!doc_ids_in_obj], collapse="\", \""), "\"!") )) } } else {} if(all(isTRUE(simplify), length(result) == 1)){ result <- result[[1]] } else {} return(result) } ) ## the standard generic for describe()<- is defined in the sylly package #' @importFrom sylly describe<- #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' describe<-,-methods #' describe<-,kRp.text-method setMethod("describe<-", signature=signature(obj="kRp.text"), function (obj, doc_id=NULL, ..., value){ if(is.null(doc_id)){ slot(obj, name="desc") <- value } else { doc_ids_in_obj <- doc_id(obj, has_id=doc_id) if(all(doc_ids_in_obj)){ if(length(doc_id) > 1){ slot(obj, name="desc")[doc_id] <- value } else { slot(obj, name="desc")[[doc_id]] <- value } } else { stop(simpleError( paste0("Invalid doc_id:\n \"", paste0(doc_id[!doc_ids_in_obj], collapse="\", \""), "\"!") )) } } return(obj) } ) ## the standard generic for language() is defined in the sylly package #' @importFrom sylly language #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' language,-methods #' language,kRp.text-method setMethod("language", signature=signature(obj="kRp.text"), function (obj){ result <- slot(obj, name="lang") return(result) } ) ## the standard generic for language()<- is defined in the sylly package #' @importFrom sylly language<- #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' language<-,-methods #' language<-,kRp.text-method setMethod("language<-", signature=signature(obj="kRp.text"), function (obj, value){ slot(obj, name="lang") <- value return(obj) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods setGeneric("diffText", function(obj, doc_id=NULL) standardGeneric("diffText")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' diffText,-methods #' diffText,kRp.text-method setMethod("diffText", signature=signature(obj="kRp.text"), function (obj, doc_id=NULL){ if(hasFeature(obj, "diff")){ result <- feature(obj, "diff", doc_id=doc_id) return(result) } else { warning("There is no feature \"diff\" in this object!") return(invisible(NULL)) } } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods setGeneric("diffText<-", function(obj, value) standardGeneric("diffText<-")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' diffText<-,-methods #' diffText<-,kRp.text-method setMethod("diffText<-", signature=signature(obj="kRp.text"), function (obj, value){ feature(obj, "diff") <- value return(obj) } ) #' @rdname kRp.text_get-methods #' @export #' @docType methods setGeneric("originalText", function(obj) standardGeneric("originalText")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' originalText,-methods #' originalText,kRp.text-method setMethod("originalText", signature=signature(obj="kRp.text"), function (obj){ return(txt_trans_revert_orig(tokens=taggedText(obj))) } ) #' @param obj An arbitrary \code{R} object. #' @rdname kRp.text_get-methods #' @export is.taggedText <- function(obj){ inherits(obj, "kRp.text") } #' @rdname kRp.text_get-methods #' @export is.kRp.text <- function(obj){ inherits(obj, "kRp.text") } #' @rdname kRp.text_get-methods #' @docType methods #' @export setGeneric("fixObject", function(obj, doc_id=NA) standardGeneric("fixObject")) #' @rdname kRp.text_get-methods #' @export #' @docType methods #' @aliases #' fixObject,-methods #' fixObject,kRp.text-method setMethod("fixObject", signature=signature(obj="kRp.text"), function (obj, doc_id=NA){ currentDf <- slot(obj, "tokens") currentDesc <- slot(obj, "desc") currentCols <- colnames(currentDf) newDf <- init.kRp.text.df(rows=nrow(currentDf)) # move all present columns to the new data.frame newDf[,currentCols] <- currentDf[,currentCols] # adjust column classes where needed lang <- slot(obj, "lang") tag.class.def <- kRp.POS.tags(lang) for (thisCol in c("tag","wclass","desc")){ if( all( !is.factor(newDf[[thisCol]]), any( !thisCol %in% "desc", !all(is.na(newDf[[thisCol]])) ) ) ){ # make tag a factor with all possible tags for this language as levels newDf[[thisCol]] <- factor( newDf[[thisCol]], levels=unique(tag.class.def[,thisCol]) ) } else {} } newDf <- indexSentenceDoc(newDf, lang=lang, doc_id=doc_id) # fix desc slot if(any(!"doc_id" %in% names(currentDesc), !is.na(doc_id))){ currentDesc[["doc_id"]] <- doc_id } else {} taggedText(obj) <- newDf newDesc <- list(currentDesc) if(!is.na(currentDesc[["doc_id"]])){ names(newDesc) <- currentDesc[["doc_id"]] } describe(obj) <- newDesc return(obj) } ) #' @rdname kRp.text_get-methods #' @docType methods #' @export setGeneric("tif_as_tokens_df", function(tokens) standardGeneric("tif_as_tokens_df")) #' @rdname kRp.text_get-methods #' @param tokens An object of class \code{\link[koRpus:kRp.text-class]{kRp.text}}. #' @export #' @docType methods #' @aliases #' tif_as_tokens_df,-methods #' tif_as_tokens_df,kRp.text-method setMethod("tif_as_tokens_df", signature=signature(tokens="kRp.text"), function(tokens){ result <- taggedText(tokens) # TIF needs doc_id to be a character vector, not a factor result[["doc_id"]] <- as.character(result[["doc_id"]]) return(result) } )