
Commit 90c107b

Merge pull request #14 from jhudsl/after_deletion
After deletion
2 parents 100f376 + ecf170b commit 90c107b

18 files changed: +115887 −93 lines

01_data_import_pre_geo.Rmd

Lines changed: 25 additions & 37 deletions
@@ -15,73 +15,65 @@ library(janitor)
 filing req code - don't have to file any forms or not 01 = 990, 02 = 990 but less than 25000 income, 03 = group return (don't know what that means), 06 = church, 07 = gov, 00 = not required to file


-updated date is 2025-04-13 00:10:44
-
-download date 5/5/2025:
+download date 9/4/2025:
 https://www.irs.gov/downloads/irs-soi --> then search for eo_md.csv
-more about it: https://www.irs.gov/statistics/soi-tax-stats-about-soi
+more about soi files: https://www.irs.gov/statistics/soi-tax-stats-about-soi

 ```{r}
-irs_new_old<- read_csv("New_version_data/eo_md.csv") #older
-irs_new<- read_csv("New_version_data/eo_md_5_25.csv") #newer
+irs_new <- read_csv("data/eo_md.csv") # current version
 ```


 # new epostcard 990-N

 Gives us information about who was an active org and helps save some from removal for revocation otherwise not listed.
 link: https://www.irs.gov/charities-non-profits/tax-exempt-organization-search-bulk-data-downloads
-direct link to data: (last data posting Oct 28, 2024) https://apps.irs.gov/pub/epostcard/data-download-epostcard.zip
+direct link to data: (last data posting March 10, 2025) https://apps.irs.gov/pub/epostcard/data-download-epostcard.zip
 dictionary: https://www.irs.gov/pub/irs-tege/990n-data-dictionary.pdf

-EIN, Tax Year , Organization Name, Gross receipts not greater than, Organization has terminated, Tax Period Begin Date , Tax Period End Date , Website URL , Principal Officer’s Name , Principal Officer’s Address Line 1, Principal Officer’s Address Line 2, Principal Officer’s Address City , Principal Officer’s Address Province, Principal Officer’s Address State , Principal Officer’s Address Zip Code , Principal Officer’s Address Country ,Organization Mailing Address Line 1 , Organization Mailing Address Line 2 , Organization Mailing Address City , Organization Mailing Address Province , Organization Mailing Address State , Organization Mailing Address Zip Code, Organization Mailing Address Country ,Organization Doing Business as Name 1, Organization Doing Business as Name 2 , Organization Doing Business as Name 3 ,
-```{r}
-# specify character for last column
-epost_old <- read_delim("New_version_data/data-download-epostcard.txt",
-delim = "|", escape_double = FALSE, col_names = FALSE,
-trim_ws = TRUE, col_types = c("X26" = "c"))
+EIN, Tax Year , Organization Name, Gross receipts not greater than, Organization has terminated, Tax Period Begin Date , Tax Period End Date , Website URL , Principal Officer’s Name , Principal Officer’s Address Line 1, Principal Officer’s Address Line 2, Principal Officer’s Address City , Principal Officer’s Address Province, Principal Officer’s Address State , Principal Officer’s Address Zip Code , Principal Officer’s Address Country ,Organization Mailing Address Line 1 , Organization Mailing Address Line 2 , Organization Mailing Address City , Organization Mailing Address Province , Organization Mailing Address State , Organization Mailing Address Zip Code, Organization Mailing Address Country ,Organization Doing Business as Name 1, Organization Doing Business as Name 2 , Organization Doing Business as Name 3

-#trying a different way to read in the data with different column lengths
-epost <- read.table("New_version_data/data-download-epostcard.txt", col.names = paste0("V", seq_len(28)), fill = TRUE, header=FALSE, sep="|")
+Over time we discovered that the file also has two extra columns and that NA values were sometimes empty strings or a single space.

-epost_new <- read.table("New_version_data/data-download-epostcard_5_8_25.txt", col.names = paste0("V", seq_len(28)), fill = TRUE, header=FALSE, sep="|")
+```{r}
+# reading in the data with different column lengths
+epost <- read.table("data/data-download-epostcard.txt", col.names = paste0("V", seq_len(28)), fill = TRUE, header = FALSE, sep = "|", quote = "", na.strings = c("", " ", NA))
 ```
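The `fill = TRUE` / `na.strings` pattern used above can be seen on a tiny in-memory example (editor's illustration with made-up rows, not part of the commit): short pipe-delimited rows are padded with `NA` instead of raising an error, and empty or single-space fields are also converted to `NA`.

```r
# Illustration only: two ragged pipe-delimited rows, read the same way as the 990-N file.
txt <- "12-3456789|2023|ORG A| |\n98-7654321|2023|ORG B\n"
dat <- read.table(text = txt, sep = "|", header = FALSE,
                  col.names = paste0("V", seq_len(5)),
                  fill = TRUE,               # pad short rows with NA instead of erroring
                  quote = "",                # org names may contain quote characters
                  na.strings = c("", " "))   # treat empty fields and single spaces as NA
dat  # both rows parse; V4 is NA in each (a lone space in row 1, missing in row 2)
```

The `quote = ""` argument matters for this kind of file: without it, a stray `"` inside an organization name can silently swallow the rest of a row.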

 ### check for problems
 ```{r, include = FALSE}
-prbs <- problems(epost)
-dim(prbs)
+nrow(problems(epost))
 ```

-<!-- ```{bash, include = FALSE} -->
-<!-- # checking the original version of the file for one of the problematic rows -->
-<!-- <!-- awk 'FNR>=9294 && FNR<=9294' New_version_data/data-download-epostcard_2024.txt # first problematic row is actually originally on line 9294 --> -->
-
-<!-- <!-- awk 'FNR>=358951 && FNR<=358951' New_version_data/data-download-epostcard_2024.txt #second problematic row --> -->
-
-<!-- <!-- awk 'FNR>=1 && FNR<=3' New_version_data/data-download-epostcard_2024.txt # checking line 1-3 to ensure that there were some empty lines for the first two lines - yup looks like it! --> -->
-<!-- ``` -->

 ### Get row names:
 ```{r}
-colnames(epost) <- c("EIN", "Tax Year" , "Organization Name", "Gross receipts not greater than", "Organization has terminated", "Tax Period Begin Date" , "Tax Period End Date" , "Website URL" , "Principal Officer’s Name" , "Principal Officer’s Address Line 1", "Principal Officer’s Address Line 2", "Principal Officer’s Address City" , "Principal Officer’s Address Province", "Principal Officer’s Address State" , "Principal Officer’s Address Zip Code" , "Principal Officer’s Address Country" ,"Organization Mailing Address Line 1" , "Organization Mailing Address Line 2" , "Organization Mailing Address City" , "Organization Mailing Address Province" , "Organization Mailing Address State" , "Organization Mailing Address Zip Code", "Organization Mailing Address Country" ,"Organization Doing Business as Name 1", "Organization Doing Business as Name 2", "Organization Doing Business as Name 3")
+colnames(epost) <- c("EIN", "Tax Year" , "Organization Name", "Gross receipts not greater than", "Organization has terminated", "Tax Period Begin Date" , "Tax Period End Date" , "Website URL" , "Principal Officer’s Name" , "Principal Officer’s Address Line 1", "Principal Officer’s Address Line 2", "Principal Officer’s Address City" , "Principal Officer’s Address Province", "Principal Officer’s Address State" , "Principal Officer’s Address Zip Code" , "Principal Officer’s Address Country" ,"Organization Mailing Address Line 1" , "Organization Mailing Address Line 2" , "Organization Mailing Address City" , "Organization Mailing Address Province" , "Organization Mailing Address State" , "Organization Mailing Address Zip Code", "Organization Mailing Address Country" ,"Organization Doing Business as Name 1", "Organization Doing Business as Name 2", "Organization Doing Business as Name 3", "extra_1", "extra_2")


 epost <- clean_names(epost) # removing spaces from names

-# checking the rows that have extra columns - only one for MD but will keep it in the dataset
-epost_extra_long <- epost %>% filter(na !="") %>% filter(organization_mailing_address_state == "MD")
-epost_extra_long2 <- epost %>% filter(na_2 !="") %>% filter(organization_mailing_address_state == "MD")
+```

+```{r}
+# checking the rows that have extra columns - only one for MD but will keep it in the dataset
+epost %>% drop_na(extra_1) %>% nrow() # look at how many are not empty values
+epost %>% drop_na(extra_2) %>% nrow() # look at how many are not empty values
+# Looks like all the extra values are NA
+# removing empty columns
+epost <- remove_empty(epost, "cols")
 ```

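`remove_empty(epost, "cols")` above is janitor's helper; conceptually it drops every column whose values are all `NA`. A base-R sketch of the same idea (illustration with toy data, not part of the commit):

```r
# Base-R equivalent of janitor::remove_empty(df, "cols"):
# keep only columns that contain at least one non-NA value.
df <- data.frame(ein = c("1", "2"), extra_1 = c(NA, NA), extra_2 = c(NA, NA))
drop_empty_cols <- function(d) d[, colSums(!is.na(d)) > 0, drop = FALSE]
df2 <- drop_empty_cols(df)
names(df2)  # only "ein" survives; the all-NA extra_1/extra_2 are dropped
```

This is why converting blank and single-space fields to `NA` at read time matters: otherwise the two extra columns would look non-empty and survive the cleanup.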


+
+
 ### How many in epost not in irs_new? filter for md

 ```{r}
 irs_new<- rename_with(irs_new, tolower) #make names lowercase
 epost <- epost %>% filter(principal_officer_s_address_state == "MD") #filter for only MD
+epost <-epost %>% mutate(ein = as.character(ein))
 epost_only <-anti_join(epost,irs_new, by ="ein")
 nrow(epost_only) # not in irs_new
 nrow(irs_new)
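The `anti_join(epost, irs_new, by = "ein")` step keeps the `epost` rows whose `ein` has no match in `irs_new`. A base-R sketch of that filter on made-up EINs (illustration only, not part of the commit):

```r
# Base-R equivalent of dplyr::anti_join on a single key column.
epost_demo   <- data.frame(ein = c("11", "22", "33"))
irs_new_demo <- data.frame(ein = c("22"))
epost_only_demo <- epost_demo[!epost_demo$ein %in% irs_new_demo$ein, , drop = FALSE]
nrow(epost_only_demo)  # 2 rows ("11" and "33") have no match in irs_new_demo
```

This also shows why the diff adds `mutate(ein = as.character(ein))` first: joins (and `%in%`) compare values, so both key columns need a common type before matching.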
@@ -107,9 +99,8 @@ nrow(IRS) == nrow(irs_new)# test if dimensions are still the same

 https://www.irs.gov/charities-non-profits/tax-exempt-organization-search-bulk-data-downloads --> click on automatic revocation or exemption list link

-Direct link downloaded 5_8_25 (this dataset will change overtime): https://apps.irs.gov/pub/epostcard/data-download-revocation.zip
+Direct link downloaded 9/4/2025 (this dataset will change over time): https://apps.irs.gov/pub/epostcard/data-download-revocation.zip

-last updated Dec,9, 2024

 Under Internal Revenue Code Section 6033(j)(1)(A), the IRS will revoke the tax-exempt status of nonprofit organizations automatically when organizations with a filing requirement do not file their required Forms 990 for three consecutive years. When such a revocation occurs, it is effective as of the filing deadline, which is typically May 15 for organizations filing on a calendar year basis. This year, however, the Form 990 filing deadline was delayed to July 15 due to the COVID-19 pandemic.

@@ -120,10 +111,7 @@ more info: https://www.irs.gov/pub/irs-pdf/p4991.pdf


 ```{r}
-revocations <- read_delim("New_version_data/data-download-revocation_3.txt",
-delim = "|", escape_double = FALSE, col_names = FALSE,
-trim_ws = TRUE)
-revocations_new <- read_delim("New_version_data/data-download-revocation_5_8_25.txt",
+revocations <- read_delim("data/data-download-revocation.txt",
 delim = "|", escape_double = FALSE, col_names = FALSE,
 trim_ws = TRUE)
 head(revocations)
@@ -136,7 +124,7 @@ IRS_with_rev <-left_join(IRS, revocations, suffix = c("irs", "rev"), by = "ein")
 ```{r}
 library(ggmap)
 IRS <- IRS_with_rev %>% unite("address",street:zipirs, remove = FALSE, sep = ", ") # this next step is slow so commenting out, making address variable
-saveRDS(IRS, file = "New_version_data/IRS_data_before_lat_long.rds")
+saveRDS(IRS, file = "data/IRS_data_before_lat_long.rds")
 ```



02_geo_coding.R

Lines changed: 63 additions & 40 deletions
@@ -1,7 +1,26 @@
 library(tidyverse)
 library(tidygeocoder)

-IRS <- readRDS(file = "New_version_data/IRS_data_before_lat_long.rds")
+IRS <- readRDS(file = "data/IRS_data_before_lat_long.rds")
+IRS <- head(IRS, n = 7)
+# Function to geocode a subset of data
+geocode_subset <- function(data_subset) {
+  data_subset %>% geocode(address = address,
+                          method = 'arcgis', lat = latitude , long = longitude)
+}
+
+# Split the data into chunks
+num_cores <- detectCores() - 1
+data_chunks <- split(IRS, rep(1:num_cores, length.out = nrow(IRS)))
+
+# Perform parallel geocoding
+results <- mclapply(data_chunks, geocode_subset, mc.cores = num_cores)
+
+# Combine results
+final_results <- bind_rows(results)
+
+# Save the results to an RDS file
+saveRDS(final_results, file = "data/geos_new.rds")

 ### example code ###########
 # some_addresses <- tibble::tribble(
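One caveat about the new chunked-geocoding code above: `detectCores()` and `mclapply()` come from base R's `parallel` package, which the two `library()` calls in the diff do not load, and `mclapply()` relies on forking, so on Windows it only runs sequentially. A minimal self-contained sketch of the same split/apply/combine pattern (editor's illustration with a stand-in function; a real run would call `tidygeocoder::geocode()` inside each chunk):

```r
library(parallel)  # provides detectCores() and mclapply()

df <- data.frame(id = 1:7, address = paste("addr", 1:7))

# Stand-in for geocode_subset(): fabricates a latitude instead of calling a geocoder.
fake_geocode <- function(chunk) { chunk$latitude <- 39 + chunk$id / 100; chunk }

num_cores <- max(1, detectCores() - 1)
chunks  <- split(df, rep(seq_len(num_cores), length.out = nrow(df)))
results <- mclapply(chunks, fake_geocode, mc.cores = 1)  # mc.cores = 1 for portability here
final   <- do.call(rbind, results)
nrow(final)  # all 7 rows come back, each with a latitude
```

Note also that `rep(1:num_cores, length.out = nrow(IRS))` interleaves rows across chunks, so the recombined result is not in the original row order; that is harmless here only because each row carries its own `ein`/`address`.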
@@ -16,8 +35,8 @@ IRS <- readRDS(file = "New_version_data/IRS_data_before_lat_long.rds")
 # lat_longs
 ################

-split_vect<- rep(1:10, length.out = nrow(IRS)) # vector to split by
-split_IRS <-IRS %>% split(split_vect)
+#split_vect<- rep(1:10, length.out = nrow(IRS)) # vector to split by
+#split_IRS <-IRS %>% split(split_vect)

 # geo_IRS_1<- geocode(split_IRS[[1]], address = address,
 # method = 'arcgis', lat = latitude , long = longitude)
@@ -42,43 +61,43 @@ split_IRS <-IRS %>% split(split_vect)



-IRSfirst3rd <- IRS[1:15000,]
-IRSsecond3rd <- IRS[15001:30000,]
-IRSthird3rd <- IRS[30001:(length(IRS$ein)),]
-
-geosfirst3rd <-IRSfirst3rd %>% geocode(address = address,
-method = 'arcgis', lat = latitude , long = longitude)
-
-saveRDS(geosfirst3rd, file = "New_version_data/first_geos.rds")
-
-IRSsecond3rd_A <-IRSsecond3rd[1:8000,]
-
-IRSsecond3rd_B <-IRSsecond3rd[8001:length(IRSfirst3rd$ein),]
-beep(1, geossecond3rd_A<-IRSsecond3rd_A %>% geocode(address = address,
-method = 'arcgis', lat = latitude , long = longitude))
-
-beep(1, geossecond3rd_B<-IRSsecond3rd_B %>% geocode(address = address,
-method = 'arcgis', lat = latitude , long = longitude))
-
-saveRDS(geossecond3rd_A, file = "New_version_data/second_geosA.rds")
-saveRDS(geossecond3rd_B, file = "New_version_data/second_geosB.rds")
-
-beep(1, geoslast3rd<-IRSthird3rd %>% geocode(address = address,
-method = 'arcgis', lat = latitude , long = longitude))
-
-saveRDS(geoslast3rd, file = "New_version_data/third_geos.rds")
-
-geos <- rbind(first_geos, geossecond3rd_A, geossecond3rd_B, geoslast3rd)
-
-saveRDS(geos, file = "New_version_data/geos.rds")
-
-
-#library(naniar)
-#geos %>%pull(latitude) %>% pct_complete() #~100 %
-
-
-
-
+# IRSfirst3rd <- IRS[1:15000,]
+# IRSsecond3rd <- IRS[15001:30000,]
+# IRSthird3rd <- IRS[30001:(length(IRS$ein)),]
+#
+# geosfirst3rd <-IRSfirst3rd %>% geocode(address = address,
+# method = 'arcgis', lat = latitude , long = longitude)
+#
+# saveRDS(geosfirst3rd, file = "data/first_geos.rds")
+#
+# IRSsecond3rd_A <-IRSsecond3rd[1:8000,]
+#
+# IRSsecond3rd_B <-IRSsecond3rd[8001:length(IRSfirst3rd$ein),]
+# beep(1, geossecond3rd_A<-IRSsecond3rd_A %>% geocode(address = address,
+# method = 'arcgis', lat = latitude , long = longitude))
+#
+# beep(1, geossecond3rd_B<-IRSsecond3rd_B %>% geocode(address = address,
+# method = 'arcgis', lat = latitude , long = longitude))
+#
+# saveRDS(geossecond3rd_A, file = "data/second_geosA.rds")
+# saveRDS(geossecond3rd_B, file = "data/second_geosB.rds")
+#
+# beep(1, geoslast3rd<-IRSthird3rd %>% geocode(address = address,
+# method = 'arcgis', lat = latitude , long = longitude))
+#
+# saveRDS(geoslast3rd, file = "data/third_geos.rds")
+#
+# geos <- rbind(first_geos, geossecond3rd_A, geossecond3rd_B, geoslast3rd)
+#
+# saveRDS(geos, file = "data/geos.rds")
+#
+#
+# #library(naniar)
+# #geos %>%pull(latitude) %>% pct_complete() #~100 %
+#
+#
+#
+#

 #' To cite tidygeocoder use:
 #'
@@ -102,3 +121,7 @@ saveRDS(geos, file = "New_version_data/geos.rds")
 #' pages = {3544},
 #' note = {R package version 1.0.5},
 #' }
+#'
+#'
+
+

04_prepare_neighborhood_data.Rmd

Lines changed: 65 additions & 4 deletions
@@ -9,8 +9,69 @@ library(naniar)

 # Data Import
 ```{r}
-df_simplified<-read_excel(here::here("Nonprofit_Baltimore_Analysis.xlsx"), sheet = 3)
-BMF <- read_csv("New_version_data/MD_BMF_V1.1.csv")
-neighborhoods <- read_csv("New_version_data/Neighborhood_Statistical_Area_(NSA)_Boundaries.csv")
-neighborhoods2 <- read_csv("New_version_data/Neighborhoods(in)_from_tyler_April4_2025.csv") # hmm these are kinda different
+
+#IRS <- read_rds("data/IRS_data_before_lat_long.rds") # 41,638 rows from the eo_md.csv
+#any(is.na(IRS$cityirs)) # no na values for city
+#IRS <- filter(IRS, is.na(cityirs) | cityirs == "BALTIMORE") #filter for Baltimore for now to keep small as we test
+#df_simplified<-read_excel(here::here("Nonprofit_Baltimore_Analysis.xlsx"), sheet = 3)
+BMF <- read_csv("data/MD_BMF_V1.1.csv") # much bigger than raw IRS data, 73,768 rows (also comes from IRS)
+neighborhoods <- read_csv("data/Neighborhood_Statistical_Area_(NSA)_Boundaries.csv")
+```
+
+## Get lat and long for shape file
+
+https://stackoverflow.com/questions/66381795/check-whether-point-coordinate-lies-within-polygon
+https://www.statsilk.com/maps/convert-esri-shapefile-map-geojson-format
+
+```{r}
+library(raster)
+library(sf)
+
+neighborhood_shape <- st_read("data/Neighborhood_Statistical_Area_(NSA)_Boundaries/Neighborhood_Statistical_Area_(NSA)_Boundaries.shp")
+
+```
+
+
+```{r}
+BMF_geo <- BMF %>% dplyr::select(EIN, LATITUDE, LONGITUDE)
+any(is.na(BMF_geo$LATITUDE)) # no missing location info
+any(is.na(BMF_geo$LONGITUDE))
+```
+
+```{r}
+CRS <- st_crs(neighborhood_shape$geometry)
+pnts_sf <- st_as_sf(BMF, coords = c('LONGITUDE', 'LATITUDE'), crs = st_crs(4326)) %>% st_set_crs(4326)
+#neighborhood_Sf <-neighborhood_shape$geometry
+#neighborhood_Sf <- neighborhood_Sf %>% st_set_crs(4326)
+pnts_trans <- st_transform(pnts_sf, 2163)
+neighborhood_tt <- st_transform(neighborhood_shape$geometry, 2163)
+intersection <- pnts_sf %>% mutate(
+  intersection = as.integer(st_intersects( pnts_trans, neighborhood_tt )))
+in_balt <- intersection %>% filter(!is.na(intersection)) # just baltimore locations
+```
+
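In the chunk above, `st_intersects()` returns, for each nonprofit point, the index of the containing neighborhood polygon (`NA` when the point lies outside all of them). As a conceptual illustration only — this is not sf's GEOS implementation — the underlying containment question is a point-in-polygon test, sketched here in base R with even-odd ray casting:

```r
# Even-odd ray casting: cast a horizontal ray from the point and count edge crossings;
# an odd count means the point is inside the polygon.
point_in_polygon <- function(px, py, vx, vy) {
  n <- length(vx); inside <- FALSE; j <- n
  for (i in seq_len(n)) {
    if ((vy[i] > py) != (vy[j] > py) &&
        px < (vx[j] - vx[i]) * (py - vy[i]) / (vy[j] - vy[i]) + vx[i]) {
      inside <- !inside
    }
    j <- i
  }
  inside
}

# Toy "neighborhood": the unit square, with one point inside and one outside.
vx <- c(0, 1, 1, 0); vy <- c(0, 0, 1, 1)
point_in_polygon(0.5, 0.5, vx, vy)  # TRUE
point_in_polygon(2.0, 0.5, vx, vy)  # FALSE
```

The projection step in the chunk (`st_transform(..., 2163)`) matters for the real test: both points and polygons must be in the same CRS before intersection, which is why the script transforms both sides first.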
+
+
+checking that it worked
+```{r}
+# first row:
+# ein geometry intersection
+# 1 010591773 POINT (-76.69024 39.36632) 43
+
+#neighborhood_shape[43,]$name
+#filter(IRS, ein == "010591773")
+
+#Looks like this is in that location
+
+```
+
+
+Combining it all together:
+```{r}
+neighborhood_shape <- as_tibble(neighborhood_shape)
+neighborhood_shape <- neighborhood_shape %>% mutate(id = row_number())
+org_data <- left_join(in_balt, neighborhood_shape, by = c("intersection" = "id"))
+
+write_rds(org_data, file = "data/processed/org_data.rds")
 ```
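The combining chunk gives each polygon a row-number `id` and left-joins each point record on its `intersection` index, so every organization picks up its neighborhood's attributes. The same lookup in base R is a `merge` (editor's illustration — the EIN-to-neighborhood pairing here is made up):

```r
# Toy stand-ins for in_balt and neighborhood_shape (hypothetical id/name mapping).
in_balt_demo <- data.frame(ein = c("010591773", "020000000"), intersection = c(43L, 7L))
shape_demo   <- data.frame(id = c(7L, 43L), name = c("Fells Point", "Roland Park"))

# Join point records to polygon attributes on the containment index.
org_data_demo <- merge(in_balt_demo, shape_demo,
                       by.x = "intersection", by.y = "id", all.x = TRUE)
org_data_demo$name  # each org now carries its neighborhood name
```

`all.x = TRUE` mirrors the `left_join`: a point whose index had no matching polygon row would keep `NA` attributes rather than being dropped.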
