Skip to content

Commit 070aa53

Browse files
Merge pull request #12 from jhudsl/cleaning
Cleaning
2 parents 17d67b0 + e05c0c9 commit 070aa53

File tree

8 files changed

+757
-1
lines changed

8 files changed

+757
-1
lines changed

01_data_import_pre_geo.Rmd

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
2+
3+
4+
# Import IRS Data
5+
```{r}
6+
library(readxl)
7+
library(here)
8+
library(tidyverse)
9+
library(janitor)
10+
```
11+
12+
## New eo_md
13+
14+
990 or not is PF filing req code
15+
Filing requirement code (whether or not an organization has to file a form): 01 = 990, 02 = 990 but less than $25,000 income, 03 = group return (don't know what that means), 06 = church, 07 = gov, 00 = not required to file
16+
17+
18+
updated date is 2025-04-13 00:10:44
19+
20+
download date 5/5/2025:
21+
https://www.irs.gov/downloads/irs-soi -->then search for eo_md.csv
22+
more about it: https://www.irs.gov/statistics/soi-tax-stats-about-soi
23+
24+
```{r}
25+
irs_new_old<- read_csv("New_version_data/eo_md.csv") #older
26+
irs_new<- read_csv("New_version_data/eo_md_5_25.csv") #newer
27+
```
28+
29+
30+
# new epostcard 990-N
31+
32+
Gives us information about which organizations were active, and helps save some organizations that are otherwise not listed from being removed for revocation.
33+
link: https://www.irs.gov/charities-non-profits/tax-exempt-organization-search-bulk-data-downloads
34+
direct link to data: (last data posting Oct 28, 2024) https://apps.irs.gov/pub/epostcard/data-download-epostcard.zip
35+
dictionary: https://www.irs.gov/pub/irs-tege/990n-data-dictionary.pdf
36+
37+
EIN, Tax Year , Organization Name, Gross receipts not greater than, Organization has terminated, Tax Period Begin Date , Tax Period End Date , Website URL , Principal Officer’s Name , Principal Officer’s Address Line 1, Principal Officer’s Address Line 2, Principal Officer’s Address City , Principal Officer’s Address Province, Principal Officer’s Address State , Principal Officer’s Address Zip Code , Principal Officer’s Address Country ,Organization Mailing Address Line 1 , Organization Mailing Address Line 2 , Organization Mailing Address City , Organization Mailing Address Province , Organization Mailing Address State , Organization Mailing Address Zip Code, Organization Mailing Address Country ,Organization Doing Business as Name 1, Organization Doing Business as Name 2 , Organization Doing Business as Name 3 ,
38+
```{r}
39+
# specify character for last column
40+
# readr-based read of the pipe-delimited e-Postcard file; column X26 is forced
# to character via col_types.
epost_old <- read_delim(
  file = "New_version_data/data-download-epostcard.txt",
  delim = "|",
  col_names = FALSE,
  col_types = c("X26" = "c"),
  escape_double = FALSE,
  trim_ws = TRUE
)
43+
44+
#trying a different way to read in the data with different column lengths
45+
# Base read.table() copes with rows that have differing numbers of fields:
# 28 column names are supplied up front and fill = TRUE pads the short rows.
# quote = "" and comment.char = "" switch off read.table()'s default quote
# (" and ') and comment (#) handling, so apostrophes or '#' inside free-text
# fields (e.g. organization names) cannot merge or truncate rows.
epost <- read.table(
  "New_version_data/data-download-epostcard.txt",
  sep = "|",
  header = FALSE,
  fill = TRUE,
  col.names = paste0("V", seq_len(28)),
  quote = "",
  comment.char = ""
)
46+
47+
# Newer e-Postcard download, read the same way as the older file: 28 column
# names supplied up front, fill = TRUE pads short rows.
# quote = "" and comment.char = "" switch off read.table()'s default quote
# (" and ') and comment (#) handling, so apostrophes or '#' inside free-text
# fields cannot merge or truncate rows.
epost_new <- read.table(
  "New_version_data/data-download-epostcard_5_8_25.txt",
  sep = "|",
  header = FALSE,
  fill = TRUE,
  col.names = paste0("V", seq_len(28)),
  quote = "",
  comment.char = ""
)
48+
```
49+
50+
### check for problems
51+
```{r, include = FALSE}
52+
# Parsing problems are only recorded on objects returned by readr readers
# (epost_old comes from read_delim()); epost comes from read.table() and
# carries no such record, so checking it would always look clean.
prbs <- problems(epost_old)
53+
dim(prbs)
54+
```
55+
56+
<!-- ```{bash, include = FALSE} -->
57+
<!-- # checking the original version of the file for one of the problematic rows -->
58+
<!-- awk 'FNR>=9294 && FNR<=9294' New_version_data/data-download-epostcard_2024.txt # first problematic row is actually originally on line 9294 -->
59+
60+
<!-- awk 'FNR>=358951 && FNR<=358951' New_version_data/data-download-epostcard_2024.txt #second problematic row -->
61+
62+
<!-- awk 'FNR>=1 && FNR<=3' New_version_data/data-download-epostcard_2024.txt # checking line 1-3 to ensure that there were some empty lines for the first two lines - yup looks like it! -->
63+
<!-- ``` -->
64+
65+
### Set column names:
66+
```{r}
67+
# Label the e-Postcard columns with the field names from the 990-N data dictionary.
# Only 26 names are given here, while epost was read with 28 columns (V1..V28).
# NOTE(review): presumably the two trailing columns end up with NA names, which is
# why they are referred to as `na` and `na_2` after clean_names() — confirm.
colnames(epost) <- c("EIN", "Tax Year" , "Organization Name", "Gross receipts not greater than", "Organization has terminated", "Tax Period Begin Date" , "Tax Period End Date" , "Website URL" , "Principal Officer’s Name" , "Principal Officer’s Address Line 1", "Principal Officer’s Address Line 2", "Principal Officer’s Address City" , "Principal Officer’s Address Province", "Principal Officer’s Address State" , "Principal Officer’s Address Zip Code" , "Principal Officer’s Address Country" ,"Organization Mailing Address Line 1" , "Organization Mailing Address Line 2" , "Organization Mailing Address City" , "Organization Mailing Address Province" , "Organization Mailing Address State" , "Organization Mailing Address Zip Code", "Organization Mailing Address Country" ,"Organization Doing Business as Name 1", "Organization Doing Business as Name 2", "Organization Doing Business as Name 3")
68+
69+
70+
# Standardize the column names (no spaces) so they are easy to reference.
epost <- epost %>% clean_names()
71+
72+
# checking the rows that have extra columns - only one for MD but will keep it in the dataset
73+
# Maryland rows (by organization mailing state) with a non-empty value in the
# first extra column (`na`).
epost_extra_long <- filter(
  epost,
  na != "",
  organization_mailing_address_state == "MD"
)
74+
# Maryland rows (by organization mailing state) with a non-empty value in the
# second extra column (`na_2`).
epost_extra_long2 <- filter(
  epost,
  na_2 != "",
  organization_mailing_address_state == "MD"
)
75+
76+
```
77+
78+
79+
### How many in epost not in irs_new? filter for md
80+
81+
```{r}
82+
# Lower-case every column name of irs_new (so the key is `ein`, as in epost).
irs_new <- irs_new %>% rename_with(tolower)
83+
# Keep only rows whose principal officer's address state is MD.
# NOTE(review): the extra-column check earlier filters on
# organization_mailing_address_state instead — confirm which state field is intended.
epost <- filter(epost, principal_officer_s_address_state == "MD")
84+
85+
# e-Postcard rows whose ein has no match in irs_new.
epost_only <- epost %>% anti_join(irs_new, by = "ein")
86+
nrow(epost_only) # not in irs_new
87+
nrow(irs_new)
88+
nrow(epost)
89+
```
90+
91+
92+
## Combine IRS data
93+
94+
```{r}
95+
96+
irs_epost <-full_join(epost, irs_new, by = "ein")
97+
98+
# Keep every row of irs_new and attach the matching e-Postcard fields (by ein)
# as additional columns where a match exists.
IRS <- irs_new %>% left_join(epost, by = "ein")
99+
nrow(IRS) == nrow(irs_new)# test if dimensions are still the same
100+
```
101+
102+
103+
104+
105+
106+
## Revocations
107+
108+
https://www.irs.gov/charities-non-profits/tax-exempt-organization-search-bulk-data-downloads --> click on automatic revocation or exemption list link
109+
110+
Direct link downloaded 5_8_25 (this dataset will change over time): https://apps.irs.gov/pub/epostcard/data-download-revocation.zip
111+
112+
Last updated Dec 9, 2024
113+
114+
Under Internal Revenue Code Section 6033(j)(1)(A), the IRS will revoke the tax-exempt status of nonprofit organizations automatically when organizations with a filing requirement do not file their required Forms 990 for three consecutive years. When such a revocation occurs, it is effective as of the filing deadline, which is typically May 15 for organizations filing on a calendar year basis. This year, however, the Form 990 filing deadline was delayed to July 15 due to the COVID-19 pandemic
115+
116+
Organizations that do not file a required annual information return or notice for three consecutive years automatically lose their tax-exempt status by operation of law. An automatic revocation is effective on the original filing due date of the third annual return or notice (the "revocation date"). Due to the COVID-19 emergency, this year the IRS extended the filing dates for these returns and notices due from April 1 through July 14 to July 15, 2020. Organizations eligible for this relief that failed to file for the two previous years and did not file by July 15 have automatically lost their tax-exempt status. Due to systemic limitations, these organizations appear on the auto-revocation list showing a revocation date between April 1 and July 14, 2020. However, the revocation date for these organizations is July 15, 2020. For more information on automatic revocation, including how to request reinstatement, see Automatic revocation - How to have your tax-exempt status reinstated.
117+
118+
more info: https://www.irs.gov/pub/irs-pdf/p4991.pdf
119+
120+
121+
122+
```{r}
123+
# Older automatic-revocation list: pipe-delimited, no header row.
revocations <- read_delim(
  file = "New_version_data/data-download-revocation_3.txt",
  delim = "|",
  col_names = FALSE,
  escape_double = FALSE,
  trim_ws = TRUE
)
126+
# Newer automatic-revocation list (5_8_25 download): pipe-delimited, no header row.
# NOTE(review): the rest of this chunk keeps working with `revocations`, not
# `revocations_new` — confirm which download should feed the join.
revocations_new <- read_delim(
  file = "New_version_data/data-download-revocation_5_8_25.txt",
  delim = "|",
  col_names = FALSE,
  escape_double = FALSE,
  trim_ws = TRUE
)
129+
head(revocations)
130+
# Give the twelve revocation columns working names.
colnames(revocations) <- c(
  "ein",
  "rev_org_name",
  "address1",
  "address2",
  "city",
  "state",
  "zip",
  "country",
  "some_rev_code",
  "rev_date1",
  "rev_date2",
  "not_sure"
)
131+
# Attach revocation info to the IRS table by ein. Columns present in both tables
# get the suffix "irs" or "rev" appended directly (no separator), e.g. `zipirs`.
IRS_with_rev <- IRS %>%
  left_join(revocations, by = "ein", suffix = c("irs", "rev"))
132+
```
133+
134+
# Prepare for Geocoding
135+
136+
```{r}
137+
library(ggmap)
138+
# Make the `address` variable by pasting the columns from `street` through
# `zipirs` together, separated by ", "; remove = FALSE keeps the source columns.
IRS <- IRS_with_rev %>% unite("address",street:zipirs, remove = FALSE, sep = ", ") # NOTE(review): the slow "next step" mentioned previously is presumably the geocoding in 02_geo_coding.R — confirm
139+
saveRDS(IRS, file = "New_version_data/IRS_data_before_lat_long.rds")
140+
```
141+
142+

02_geo_coding.R

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
library(tidyverse)
2+
library(tidygeocoder)
3+
4+
IRS <- readRDS(file = "New_version_data/IRS_data_before_lat_long.rds")
5+
6+
### example code ###########
7+
# some_addresses <- tibble::tribble(
8+
# ~name, ~addr,
9+
# "White House", "1600 pennsylvania ave nw, washington, dc",
10+
# "Transamerica Pyramid", "600 Montgomery St, San Francisco, CA, 94111",
11+
# "Willis Tower", "233 S WACKER DR, CHICAGO, IL, 60606-6300"
12+
# )
13+
#
14+
# lat_longs <- some_addresses %>%
15+
# geocode(addr, method = 'osm', lat = latitude , long = longitude)
16+
# lat_longs
17+
################
18+
19+
# Group id (1 to 10, recycled) for every row of IRS; used to split the data.
split_vect <- rep(seq_len(10), length.out = nrow(IRS))
20+
# List of data frames, one per group id in split_vect.
split_IRS <- split(IRS, split_vect)
21+
22+
# geo_IRS_1<- geocode(split_IRS[[1]], address = address,
23+
# method = 'arcgis', lat = latitude , long = longitude)
24+
# geo_IRS_2<- geocode(split_IRS[[2]], address = address,
25+
# method = 'arcgis', lat = latitude , long = longitude)
26+
# geo_IRS_3<- geocode(split_IRS[[3]], address = address,
27+
# method = 'arcgis', lat = latitude , long = longitude)
28+
# geo_IRS_4<- geocode(split_IRS[[4]], address = address,
29+
# method = 'arcgis', lat = latitude , long = longitude)
30+
# geo_IRS_5<- geocode(split_IRS[[5]], address = address,
31+
# method = 'arcgis', lat = latitude , long = longitude)
32+
# geo_IRS_6<- geocode(split_IRS[[6]], address = address,
33+
# method = 'arcgis', lat = latitude , long = longitude)
34+
# geo_IRS_7<- geocode(split_IRS[[7]], address = address,
35+
# method = 'arcgis', lat = latitude , long = longitude)
36+
# geo_IRS_8<- geocode(split_IRS[[8]], address = address,
37+
# method = 'arcgis', lat = latitude , long = longitude)
38+
# geo_IRS_9<- geocode(split_IRS[[9]], address = address,
39+
# method = 'arcgis', lat = latitude , long = longitude)
40+
# geo_IRS_10<- geocode(split_IRS[[10]], address = address,
41+
# method = 'arcgis', lat = latitude , long = longitude)
42+
43+
44+
45+
# Cut IRS into three consecutive row ranges so geocoding can be run in pieces:
# rows 1-15000, rows 15001-30000, and row 30001 through the last row.
IRSfirst3rd <- IRS[1:15000, ]
IRSsecond3rd <- IRS[15001:30000, ]
IRSthird3rd <- IRS[30001:nrow(IRS), ]
48+
49+
# Geocode the first chunk with the ArcGIS service; results are stored in new
# `latitude` / `longitude` columns.
geosfirst3rd <- geocode(
  IRSfirst3rd,
  address = address,
  method = "arcgis",
  lat = latitude,
  long = longitude
)
51+
52+
saveRDS(geosfirst3rd, file = "New_version_data/first_geos.rds")
53+
54+
# Split the second chunk in two so each geocoding call handles fewer rows:
# part A is rows 1-8000, part B is row 8001 through the end of the same chunk.
IRSsecond3rd_A <- IRSsecond3rd[1:8000, ]

# The upper bound must come from IRSsecond3rd itself; it was previously taken
# from IRSfirst3rd, which only worked because both chunks have the same length.
IRSsecond3rd_B <- IRSsecond3rd[8001:nrow(IRSsecond3rd), ]
57+
# Geocode part A of the second chunk with the ArcGIS service.
geossecond3rd_A <- IRSsecond3rd_A %>%
  geocode(
    address = address,
    method = "arcgis",
    lat = latitude,
    long = longitude
  )
# Audible cue that the long-running call has finished. beepr is never attached
# in this script, so the beep is namespace-qualified and skipped when the
# package is not installed instead of stopping the script.
if (requireNamespace("beepr", quietly = TRUE)) {
  beepr::beep(1)
}
59+
60+
# Geocode part B of the second chunk with the ArcGIS service.
geossecond3rd_B <- IRSsecond3rd_B %>%
  geocode(
    address = address,
    method = "arcgis",
    lat = latitude,
    long = longitude
  )
# Audible cue that the long-running call has finished. beepr is never attached
# in this script, so the beep is namespace-qualified and skipped when the
# package is not installed instead of stopping the script.
if (requireNamespace("beepr", quietly = TRUE)) {
  beepr::beep(1)
}
62+
63+
saveRDS(geossecond3rd_A, file = "New_version_data/second_geosA.rds")
64+
saveRDS(geossecond3rd_B, file = "New_version_data/second_geosB.rds")
65+
66+
# Geocode the last chunk with the ArcGIS service.
geoslast3rd <- IRSthird3rd %>%
  geocode(
    address = address,
    method = "arcgis",
    lat = latitude,
    long = longitude
  )
# Audible cue that the long-running call has finished. beepr is never attached
# in this script, so the beep is namespace-qualified and skipped when the
# package is not installed instead of stopping the script.
if (requireNamespace("beepr", quietly = TRUE)) {
  beepr::beep(1)
}
68+
69+
saveRDS(geoslast3rd, file = "New_version_data/third_geos.rds")
70+
71+
# Stack the geocoded pieces back into one table, in original row order.
# The first piece is the object `geosfirst3rd`; "first_geos" is only the name
# of the .rds file it was saved to, not an object in this script.
geos <- rbind(geosfirst3rd, geossecond3rd_A, geossecond3rd_B, geoslast3rd)
72+
73+
saveRDS(geos, file = "New_version_data/geos.rds")
74+
75+
76+
#library(naniar)
77+
#geos %>%pull(latitude) %>% pct_complete() #~100 %
78+
79+
80+
81+
82+
83+
#' To cite tidygeocoder use:
84+
#'
85+
#' Cambon J, Hernangómez D, Belanger C, Possenriede D (2021).
86+
#' tidygeocoder: An R package for geocoding. Journal of Open Source
87+
#' Software, 6(65), 3544, https://doi.org/10.21105/joss.03544 (R package
88+
#' version 1.0.5)
89+
#'
90+
#' A BibTeX entry for LaTeX users is
91+
#'
92+
#' @Article{,
93+
#' title = {tidygeocoder: An R package for geocoding},
94+
#' author = {Jesse Cambon and Diego Hernangómez and Christopher Belanger and Daniel Possenriede},
95+
#' year = {2021},
96+
#' journal = {Journal of Open Source Software},
97+
#' publisher = {The Open Journal},
98+
#' doi = {10.21105/joss.03544},
99+
#' url = {https://doi.org/10.21105/joss.03544},
100+
#' volume = {6},
101+
#' number = {65},
102+
#' pages = {3544},
103+
#' note = {R package version 1.0.5},
104+
#' }

0 commit comments

Comments
 (0)