#' get.text #' #' Extracts main textual content from NISO-JATS coded XML file or text as sectioned text. #' @param x a NISO-JATS coded XML file or text. #' @param sectionsplit search patterns for section split (forced to lower case), e.g. c("intro", "method", "result", "discus"). #' @param grepsection search pattern to reduce text to specific section namings only. #' @param letter.convert Logical. If TRUE converts hexadecimal and HTML coded characters to Unicode. #' @param greek2text Logical. If TRUE some greek letters and special characters will be unified to textual representation (important to extract stats). #' @param sentences Logical. IF TRUE text is returned as sectioned list with sentences. #' @param paragraph Logical. IF TRUE "" is added at the end of each paragraph to enable manual splitting at paragraphs. #' @param cermine Logical. If TRUE CERMINE specific error handling and letter conversion will be applied. If set to "auto" file name ending with 'cermxml$' will set cermine=TRUE. #' @param rm.table Logical. If TRUE removes tag from text. #' @param rm.formula Logical. If TRUE removes tags. #' @param rm.xref Logical. If TRUE removes tag (citing) from text. #' @param rm.media Logical. If TRUE removes tag from text. #' @param rm.graphic Logical. If TRUE removes and tag from text. #' @param rm.ext_link Logical. If TRUE removes tag from text. #' @seealso \code{\link[JATSdecoder]{JATSdecoder}} for simultaneous extraction of meta-tags, abstract, sectioned text and reference list. #' @return List with two elements. 1: Character vector with section title/s, 2: Character vector with floating text of sections or list with vector of sentences per section/s if sentences=TRUE. #' @export get.text<-function(x,sectionsplit="", grepsection="", letter.convert=TRUE, greek2text=FALSE, sentences=FALSE, paragraph=FALSE, cermine="auto", rm.table=TRUE, rm.formula=TRUE, rm.xref=TRUE, rm.media=TRUE, rm.graphic=TRUE, rm.ext_link=TRUE ){ captions<-character(0) # check if x is an object else return standard error message if(length(x)>0) if(is.na(x)[1]) tryCatch(x,error=function(e) message(e)) # set cermine if "auto" if(cermine=="auto") cermine<-ifelse(length(grep("cermxml$",x[1]))>0,TRUE,FALSE) # run prechecks or readLines(x) if x is file x<-preCheck(x) # remove strangely converted lines in CERMXML (mostly from tables) if(cermine==TRUE){ # remove lines with at least "num (num), num (num)" or "num (num) num (num)" x<-grep("[-0-9\\.] \\([-0-9\\.]*\\). [-0-9\\.]* \\([-0-9\\.]*\\)|[-0-9\\.] \\([-0-9\\.]*\\) [-0-9\\.]* \\([-0-9\\.]*\\)",x,invert=TRUE,value=TRUE) i<-NULL # lines with no .,; and < nor word with at least 4 letters i<-c(i,which(nchar(x)<15)[which(is.element(which(nchar(x)<15),grep("[-\\.,,;<)(]|[a-z]{4}|[a-zA-Z]{2}$",x,invert=TRUE)))]) # lines with three dotted numbers in a row i<-c(i,grep("\\.[0-9]*? *?\\.[0-9]*? *?\\.[0-9]*?",x)) i<-c(i,grep("\\.[0-9]*? *?[0-9]\\.[0-9]*? *?[0-9]\\.[0-9]*?",x)) i<-c(i,grep("^[a-z0-9A-Z] [a-z0-9A-Z] [a-z0-9A-Z] ",x)) i<-c(i,grep("^[a-z0-9A-Z][a-z0-9A-Z] [a-z0-9A-Z][a-z0-9A-Z] [a-z0-9A-Z][< ]",x)) i<-c(i,grep("^[a-z0-9A-Z] [a-z0-9A-Z][a-z0-9A-Z] [a-z0-9A-Z][< ]",x)) i<-c(i,grep("^[a-z0-9A-Z] [a-z0-9A-Z][a-z0-9A-Z] [a-z0-9A-Z][a-z0-9A-Z][< ]",x)) i<-c(i,grep("^-[a-z0-9A-Z] [a-z0-9A-Z] [a-z0-9A-Z]",x)) i<-c(i,grep("^-[a-z0-9A-Z][a-z0-9A-Z] [a-z0-9A-Z][a-z0-9A-Z] [a-z0-9A-Z][< ]",x)) i<-c(i,grep("^-[a-z0-9A-Z] [a-z0-9A-Z][a-z0-9A-Z] [a-z0-9A-Z][< ]",x)) i<-c(i,grep("^-[a-z0-9A-Z] [a-z0-9A-Z][a-z0-9A-Z] [a-z0-9A-Z][a-z0-9A-Z][< ]",x)) i<-c(i,grep(" [a-z0-9A-Z] [a-z0-9A-Z] [a-z0-9A-Z] ",x)) i<-c(i,grep("[a-z0-9A-Z]{22}",x)) i<-c(i,grep("[,\\.;][a-zA-Z] [a-zA-Z0-9] ",x)) i<-c(i,grep("^.$",x)) i<-c(i,grep(" [;,] [;,] |,[a-zA-z]",x)) i<-c(i,grep("^. .$",x)) i<-c(i,grep("

[a-zA-Z]$",x)) i<-c(i,grep("

[a-zA-Z0-9] [a-zA-Z0-9] [a-zA-Z0-9] ",x)) i<-c(i,grep("

[0-9] [a-zA-Z0-9] [a-zA-Z0-9] |

[0-9][0-9] [a-zA-Z0-9] [a-zA-Z0-9] ",x)) i<-c(i,grep(" - [a-zA-Z] $",x)) i<-c(i,grep(" -[0-9] [a-zA-Z] $",x)) i<-c(i,grep(" [^ ] [^ ] [^ ] [^ ] ",x)) i<-c(i,grep("^[^ ] [^ ] [^ ] [^ ]",x)) i<-c(i,grep("^[A-Za-z] -[^ ] [^ ]",x)) i<-c(i,grep("[a-zA-Z]\\%",x)) i<-sort(unique(i)) # only take such lines that don't have an important tag s<-grep("]||article-type=| p [\\.0-9][0-9\\.]|[\\(\\[][0-9A-Za-z]*?[\\]\\)]",x) i<-i[!is.element(i,s)] # and such that don't have short words s<-grep(" is | be | was | or | and | to | in | at | the | no | not | we | F\\( | t\\(",x) i<-i[!is.element(i,s)] s<-grep(" is^| be^| was^| or^| and^| to^| in^| at^| the^| no^| not^| we^| F \\( | t \\(|^[ \\(\\)\\-]*?$",x) i<-i[!is.element(i,s)] if(length(i>0)) x<-x[-i] } # for text with tag if(length(grep("",x))>0&length(grep("article-type=\"correction\">",x))==0){ # collapse text to one line temp<-paste(x,collapse=" ") # remove text til end of abstract temp<-gsub(".*"))[-1] # grep section names sec<-gsub(".*","",gsub(".*","",txt)) ## section clean up # remove html tags from section titles sec<-gsub("<.*?.*>"," ",sec) sec<-gsub(" "," ",sec) # hex letter conversion if(length(grep("\\&#",sec))>0&letter.convert==T) sec<-letter.convert(sec,cermine=cermine,greek2text=greek2text) # remove numbers in front of line while(length(grep("^[1-9]\\.|^[0-9] ",sec))>0) sec<-gsub("^[1-9]\\.|^[0-9] |^\\.","",sec) # remove brakets in front of line while(length(grep("^\\(.*?.*\\) |\\(.\\) ",sec))>0) sec<-gsub("^\\(.*?.*\\) |\\(.\\) ","",sec) ## text clean up # remove text after begin of reference list or end of body txt<-gsub(" txt<-sub(".*?()|.*?()", "", txt) # letter convert if(letter.convert==TRUE) txt<-letter.convert(txt,cermine=cermine,greek2text=greek2text,warning=FALSE) # unify most usual degree of fredoms noted in "[]" to "()" in statistical reports txt<-gsub("([^a-zA-z][FRrzHQZTt2])\\[([0-9\\. ,;]*?)\\]","\\1(\\2)",txt) txt<-gsub("([^a-zA-z][FRrzHQZTt2]) \\[([0-9\\. ,;]*?)\\]","\\1(\\2)",txt) txt<-gsub("(^[FRrzQHZTt2])\\[([0-9\\. ,;]*?)\\]","\\1(\\2)",txt) txt<-gsub("(^[FRrzQHZTt2]) \\[([0-9\\. ,;]*?)\\]","\\1(\\2)",txt) # clean up white spaces txt<-gsub("^ *|(?<= ) | *$", "", txt, perl = TRUE) ## split text to section split words # for text with sections if(length(sec)>1){ # get index of lines to collapse index<-unique( c(1, grep(paste(tolower(sectionsplit),collapse="|"),tolower(sec)) ,ifelse(is.element(length(sec),grep(paste(tolower(sectionsplit),collapse="|"),tolower(sec))),length(sec)+1,length(sec)+1)) ) section<-NULL;textred<-NULL # collapse lines by "sectionsplit" for(i in 1:(length(index)-1)){ from<-index[i] to<-index[i+1]-1 if(i==length(index)) to<-index[i+1] section<-c(section,paste(sec[from:to],collapse=";; ")) textred<-c(textred,paste(txt[from:to],collapse=" ")) } }else {textred<-txt;section<-sec} # reduce content to lines with grepsection pattern if(grepsection!=""){ textred<-textred[grep(grepsection,section)] section<-section[grep(grepsection,section)] } # extract captions from tables and figures captions<-character(0) if(sum(unlist(grepl("</caption>",textred)))>0){ # split lines with captions temp<-unlist(textred) captions<-unlist(strsplit2(temp,"</caption>","after")) # select lines with </caption> captions<-grep("</caption>",captions,value=TRUE) # clean up captions<-gsub("</caption>","",gsub(".*<caption>|<caption>|.*<caption.*?.*\"/>|<caption.*?.*\"/>|.*<caption.*?.*\">|<caption.*?.*\">","",captions)) # remove html captions<-gsub("</[a-z]*>|</[a-z]>|<[a-z]*>|<[a-z]>|<caption.*?.*\"/>|<xref.*?.*\"/>|<xref.*?.*\">","",captions) } ## Text clean up after sentencing # remove listing definition textred<-lapply(textred,function(x) gsub("<list-item>","-",x)) textred<-lapply(textred,function(x) gsub("</list-item>","",x)) textred<-lapply(textred,function(x) gsub("<list list-type.*?>","",x)) # remove <table> from text #if(rm.table==T) textred<-lapply(textred,function(x) gsub("<tr.*?</tr>","",x)) #if(rm.table==T) textred<-lapply(textred,function(x) gsub("<tr.*?</td>","",x)) #if(rm.table==T) textred<-lapply(textred,function(x) gsub("<tbody>.*?</tbody>","",x)) #if(rm.table==T) textred<-lapply(textred,function(x) gsub("<thead>.*?</thead>","",x)) if(rm.table==T) textred<-lapply(textred,function(x) gsub("<table.*?</table>|<table-wrap .*?</table-wrap>|<table-wrap-foot.*?</table-wrap-foot>","",x)) if(rm.table==T) textred<-lapply(textred,function(x) gsub("<table>|</table>|<table-wrap>|</table-wrap>|<table-wrap-foot>|</table-wrap-foot>","",x)) # remove <xref> from text if(rm.xref==T) textred<-lapply(textred,function(x) gsub("</xref>","",gsub("<xref.*?>","",x))) #if(rm.xref==T) textred<-lapply(textred,function(x) gsub("<xref.*?.*</xref>","",x)) # remove <media> from text if(rm.media==T) textred<-lapply(textred,function(x) gsub("<media.*?.*</media>","",x)) # remove <ext-link> from text if(rm.ext_link==T) textred<-lapply(textred,function(x) gsub("<ext-link.*?>|</ext-link>","",x)) # remove <graphic> from text if(rm.graphic==T){ textred<-lapply(textred,function(x) gsub("<graphic.*?.*</graphic>","removedGRAPHIC",x)) textred<-lapply(textred,function(x) gsub("<fig.*?.*</fig>","removedGRAPHIC",x)) textred<-lapply(textred,function(x) gsub("<object-id.*?.*</object-id>","",x)) textred<-lapply(textred,function(x) gsub("<graphic.*?.*</fig>","removedGRAPHIC",x)) textred<-lapply(textred,function(x) gsub("<fig.*?.*\">","removedGRAPHIC",x)) textred<-lapply(textred,function(x) gsub("<label.*?.*</label>","",x)) textred<-lapply(textred,function(x) gsub("<graphic.*?.*\"/>","removedGRAPHIC",x)) } # remove formula-tags from text if(rm.formula==T){ textred<-lapply(textred,function(x) gsub("<disp .*?.*\"/>|</disp .*?.*>"," ",x)) textred<-lapply(textred,function(x) gsub("<inline-f.*?>|</inline-f.*?>"," ",x)) textred<-lapply(textred,function(x) gsub("<mml.*?>|</mml.*?>"," ",x)) } # remove <caption> from text if some is left textred<-lapply(textred,function(x) gsub("</caption>","",gsub("<caption.*?.*</caption>","",x))) # add "<New paragraph>. if(paragraph==TRUE){ textred<-lapply(textred,function(x) gsub("</p>"," <New paragraph>. ",x)) textred<-lapply(textred,function(x) gsub(" $| <New paragraph>. $","",x)) } # remove <title> and <p> and <sec> <italic> <b> <bold> <undeline> tag textred<-lapply(textred,function(x) gsub("<title>||

|

||||||"," ",x)) textred<-lapply(textred,function(x) gsub("||||||||"," ",x)) textred<-lapply(textred,function(x) gsub("|||"," ",x)) # replace with ^ textred<-lapply(textred,function(x) gsub("","^",x)) # remove paragraph id textred<-lapply(textred,function(x) gsub("

|","",x)) # remove citing breaks [] #textred<-gsub("\\[.*?.*]|\\[\\]","",textred) # clean up ",," and unlist textred<-lapply(textred,function(x) gsub(", ,|, , ,|,,|,,,",",",x)) textred<-lapply(textred,function(x) gsub(", \\.|,\\.",".",x)) textred<-unlist(textred) #textred<-lapply(textred,paste,collapse=" ") # clean up white spaces textred<-gsub("^ *|(?<= ) | *$", "", textred, perl = TRUE) # paste empty (nchar<2) textlines t with t+1, same for section index<-(1:length(textred))[nchar(textred)<=2] if(length(index)>0){ # reverse index index<-index[length(index):1] # remove 1 index<-index[index!=1] if(length(index)>0){ # and paste empty text and section lines for(i in index){ textred[i-1]<-paste(textred[i-1],textred[i],sep=";; ") section[i-1]<-paste(section[i-1],section[i],sep=";; ") } textred<-textred[-index] section<-section[-index] } } # if first textelement is empty paste 1:2 sections and remove first text element if(!is.na(textred[1])) if(nchar(textred[1])<=2){ textred[1]<-paste(textred[1],textred[2],sep=";; ") textred<-textred[-2] section[1]<-paste(section[1],section[2],sep=";; ") section<-section[-2] } # remove double spaces and remaining space/end in front of line textred<-gsub("^ *|(?<= ) | *$", "", textred, perl = TRUE) # hex code to letter if(letter.convert==T) textred<-letter.convert(textred,cermine=cermine,warning=FALSE) # remove space before . and , textred<-gsub(" \\. ",". ",gsub(" , ",", ",textred)) textred<-gsub(" \\. ",". ",gsub(",,",",",textred)) # set ",." to "." textred<-gsub(", \\.|,\\.",".",gsub(" , ",", ",textred)) # remove white space in brackets textred<-gsub(" [)]",")",gsub("[(] ","(",textred)) textred<-gsub(" [\\]]","]",gsub("[\\[] ","[",textred)) # remove white space at beginning and end textred<-gsub("^ | $","",textred) # remove empty brakets textred<-gsub(" \\(,\\)| \\(\\)","",textred) # [ bracket clean up textred<-gsub("\\[\\]|\\[[;,-] \\]|\\[[;,-] [;,-] \\]|\\[\\-\\]| \\[ \\]","",textred) # superscript clean up textred<-gsub(" \\^ | \\^|\\^ ","^",textred) }else{textred<-"";section<-""} if(!exists("section")) section<-"" if(!exists("textred")) textred<-"" section<-gsub(";; ;;",";; ",section) # remove emty sections and text elements if length of section > 1 if(sum(section==""&textred=="")>0){ if(length(section)>1){ i<-which(section==""&textred=="")*-1 textred<-textred[i] section<-section[i] } } textred<-lapply(textred,function(x) gsub(" *. *;;",";;",x)) if(sentences==TRUE) textred<-lapply(textred,text2sentences) if(sentences==TRUE) captions<-lapply(captions,text2sentences) # convert ". " -> and remove it at end of section if(paragraph==T) textred<-lapply(textred,function(x) gsub(". *"," ",x)) if(paragraph==T) textred<-lapply(textred,function(x) gsub("^ ","",x)) if(paragraph==T) textred<-lapply(textred,function(x) gsub(" *$","",x)) # remove if text only contains html tag textred<-lapply(textred,function(x){ out<-x[length(grep("^$|^$",x))==0] if(length(out)==0) out<-"" return(out) }) captions<-lapply(captions,function(x){ out<-x[length(grep("^$|^$",x))==0] if(length(out)==0) out<-"" return(out) }) # output as list list(section=section,text=textred,captions=captions) }