### start from an empty workspace: this removes every object in the global environment
rm(list=ls())
### load the libraries we need (library() attaches them, it does not install them)
library(glmnet)
library(stringr)
library(Matrix)
library(tm)
library(wordcloud)
library(SnowballC)
library(RgoogleMaps)
### keep strings as character vectors instead of converting them to factors
options(stringsAsFactors = FALSE)

### set the directories we need
### read.folder is where the raw files are listed from, save.folder is where the sample is written
read.folder <- "/Users/adamnowak/desktop/ARMLS"
save.folder <- "/Users/adamnowak/Dropbox/TextPricing/Data"

### parameters for shading low-to-high
brew.colors <- c("blue", "red")
plot.size <- 0.5

### how many possible bigrams are we choosing from?
tokens.to.use <- 500

### coloring function
###
### COLORS : colours handed to colorRamp() to build the low-to-high ramp
### x      : numeric vector to shade
###
### Every element of x is placed on the ramp by its rank share (rank / length),
### with ties broken at random. Opacity is lowest for ranks near the middle and
### rises towards both extremes, between 0.25*250 and 250 on a 0-255 scale.
### Returns one colour string per element of x.
BREW <- function(COLORS,x)
{
  min.opacity <- 0.25
  rank.share <- rank(x, ties.method='random') / length(x)
  opacity <- 250*(min.opacity+(1-min.opacity)*2*abs(rank.share-0.5))
  rgb.values <- colorRamp(COLORS)(rank.share)
  rgb(rgb.values, alpha=opacity, maxColorValue=255)
}

### read in the data
### work from the folder holding the raw files
setwd(read.folder)
### show the first file name as a quick check
list.files()[1]
### every entry listed in the folder is treated as an input file
files.to.use <- list.files()
### share of each file's rows that sampler() keeps, applied before any filtering
use.percentage <- 0.5



### sampler: read one raw file and return a cleaned random sample of its sales
###
### x : path of a CSV file (relative paths are resolved against the working directory)
###
### Reads the global `use.percentage`: the share of the file's rows that is drawn,
### without replacement, BEFORE any filtering takes place.
###
### Returns a data frame with lower-case column names holding only the columns
### listed in `vars`, restricted to rows with no missing value in any of them.
sampler <- function(x)
{
  print(x)

  ### stringsAsFactors is spelled out so the remarks and dates arrive as character
  ### vectors regardless of the global option
  raw <- read.csv(file=x, header=TRUE, stringsAsFactors=FALSE)

  ### seq_len() stays empty for a zero-row file, where 1:nrow(raw) would be c(1,0)
  n.keep <- floor(nrow(raw)*use.percentage)
  raw <- raw[sample(seq_len(nrow(raw)), n.keep, replace=FALSE),]

  ### only houses
  raw <- subset(raw, Book.Section=="Single Family - Detached")

  ### re-scale the latitude and longitude
  ### you might not need to do this
  ### NOTE(review): presumably the raw coordinates are degrees times 1e6 -- confirm
  raw$Geo.Lon <- raw$Geo.Lon/1000000
  raw$Geo.Lat <- raw$Geo.Lat/1000000
  raw$latitude <- raw$Geo.Lat
  raw$longitude <- raw$Geo.Lon

  ### drop non-sales (subset() also drops rows where Sold.Price is NA)
  raw <- subset(raw, Sold.Price > 0)

  ### drop missing data
  raw <- raw[!is.na(raw$Geo.Lat),]
  raw <- raw[!is.na(raw$Geo.Lon),]
  raw <- raw[!is.na(raw$Approx.SQFT),]
  raw <- raw[!is.na(raw$Sold.Price),]

  ### trim the data
  ### all cut-offs are computed first, on the same rows, and only then applied;
  ### the comparisons are strict, so rows sitting exactly on a cut-off are dropped
  q.lat <- quantile(raw$Geo.Lat, c(0.005,0.995))
  q.lon <- quantile(raw$Geo.Lon, c(0.005,0.995))
  q.sqft <- quantile(raw$Approx.SQFT, c(0.005,0.995))
  q.price <- quantile(raw$Sold.Price, c(0.005,0.995))
  raw <- subset(raw, Geo.Lat < q.lat[2])
  raw <- subset(raw, Geo.Lat > q.lat[1])
  raw <- subset(raw, Geo.Lon < q.lon[2])
  raw <- subset(raw, Geo.Lon > q.lon[1])
  raw <- subset(raw, Approx.SQFT < q.sqft[2])
  raw <- subset(raw, Approx.SQFT > q.sqft[1])
  raw <- subset(raw, Sold.Price < q.price[2])
  raw <- subset(raw, Sold.Price > q.price[1])

  ### clean the remarks
  ### the order matters: text is lower-cased first, so the upper-case STOP marker
  ### (put in place of ". ") cannot be confused with an ordinary word
  raw$Public.Remarks <- iconv(raw$Public.Remarks, "latin1", "ASCII", sub="")
  raw$Public.Remarks <- tolower(raw$Public.Remarks)
  raw$Public.Remarks <- gsub("\\. "," STOP ", raw$Public.Remarks)
  raw$Public.Remarks <- gsub("[0-9]+","", raw$Public.Remarks)
  ### hyphens are deleted (not spaced) so hyphenated words become one token
  raw$Public.Remarks <- gsub("-","", raw$Public.Remarks)
  ### everything else that is not a letter or digit becomes a space,
  ### then runs of spaces are collapsed
  raw$Public.Remarks <- gsub("[^[:alnum:]]"," ", raw$Public.Remarks)
  raw$Public.Remarks <- gsub(" +"," ", raw$Public.Remarks)

  ### square footage bins (rounded down to the nearest 100)
  raw$Approx.SQFT.100 <- 100*floor(raw$Approx.SQFT/100)
  raw$SQFT <- factor(raw$Approx.SQFT.100)

  ### Year data
  ### characters 1-4 are read as the year and 6-7 as the month
  ### NOTE(review): this assumes dates look like YYYY-MM-DD -- confirm
  raw$list.year <- as.numeric(substr(raw$List.Date,1,4))
  raw$list.month <- as.numeric(substr(raw$List.Date,6,7))
  raw$change.year <- as.numeric(substr(raw$Status.Change.Date,1,4))
  raw$change.month <- as.numeric(substr(raw$Status.Change.Date,6,7))

  ### bedrooms
  ### NOTE(review): X..Bedrooms / X..Bathrooms look like read.csv's renaming of
  ### headers such as "# Bedrooms" -- confirm against the raw files
  raw$Bedrooms <- raw$X..Bedrooms
  raw$Bathrooms <- raw$X..Bathrooms

  ### clean the beds and baths
  raw <- subset(raw, Bathrooms>0)
  raw <- subset(raw, Bathrooms<20)
  raw <- subset(raw, Bedrooms>0)
  raw <- subset(raw, Bedrooms<10)
  raw <- subset(raw, Year.Built>=1900)
  raw <- subset(raw, Year.Built<2100)
  raw <- subset(raw, change.year>=1900)
  raw <- subset(raw, change.year<2100)

  ### lower-case every column name, then keep only the columns we need
  colnames(raw) <- tolower(colnames(raw))
  vars <- c("list.year","list.month","change.year","change.month","list.price","sold.price","city.town.code",
            "public.remarks","year.built","bedrooms","bathrooms","zip.code","approx.sqft","longitude","latitude")
  raw <- raw[,vars]

  ### keep complete cases
  raw <- raw[complete.cases(raw),]

  return(raw)
}


### run sampler() on every file and stack the results row-wise into one data frame
data.set <- do.call('rbind' , lapply(files.to.use, sampler) )
### number of rows per status-change year
table(data.set$change.year)

### write the pooled sample into the save folder
### (row names are written too, since write.csv's row.names default is left unchanged)
setwd(save.folder)
write.csv(data.set,file="MLSsample.csv")

### look at the first and last rows
head(data.set)
tail(data.set)



