library(stringr)

# Tidy a raw text export (one record per line) into a data frame with three
# columns: the trailing date, the text, and the URL found on each line.
#
# NOTE: every statement must sit on its own line. In R a `#` starts a comment
# that runs to the end of the physical line, so code placed after a comment on
# the same line is silently never executed.

# Load the raw data, one record per line, into a single column `V1`.
# Quote and comment processing are disabled because the input is free text:
# with read.table()'s defaults an apostrophe or double quote can merge several
# lines into one field, and a "#" truncates the rest of the line.
RawData <- read.table(
  "DB_YS.txt",
  sep = "\n",
  header = FALSE,
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
)

# Extract the date at the end of each line, e.g. "March 3, 2014"
# (a word, a day number, a comma and a four-digit year, optionally followed by
# trailing spaces). Lines without such a suffix yield NA.
dates <- str_extract(RawData$V1, "[A-Za-z]+ \\d+, \\d{4} *$")

# Extract the URL, assuming every URL starts with "http". Lines without one
# yield NA.
# NOTE(review): `.+` is greedy and runs to the end of the line, so anything
# that follows the URL on the same line is captured in `url` (and removed from
# `text` below) -- confirm this matches the layout of the input file.
url <- str_extract(RawData$V1, "http.+")

# Drop the URL (and everything after it) from each line to get the text.
text <- gsub("http.+", "", RawData$V1)

# Drop the " indyref" marker from the text (per the original author's note it
# only tells Twitter where to place the URL in the tweet).
text <- gsub(" indyref", "", text)

# Assemble the tidy result; keep all columns as character vectors.
Data <- data.frame(dates, text, url, stringsAsFactors = FALSE)