|
| 1 | + |
| 2 | + |
| 3 | + |
| 4 | +# Import IRS Data |
| 5 | +```{r} |
| 6 | +library(readxl) |
| 7 | +library(here) |
| 8 | +library(tidyverse) |
| 9 | +library(janitor) |
| 10 | +``` |
| 11 | + |
| 12 | +## New eo_md |
| 13 | + |
| 14 | +990 or not is PF filing req code |
| 15 | +Filing requirement code (whether the organization has to file any forms): 01 = 990, 02 = 990 but less than 25000 income, 03 = group return (not sure what that means), 06 = church, 07 = gov, 00 = not required to file |
| 16 | + |
| 17 | + |
| 18 | +updated date is 2025-04-13 00:10:44 |
| 19 | + |
| 20 | +download date 5/5/2025: |
| 21 | + https://www.irs.gov/downloads/irs-soi -->then search for eo_md.csv |
| 22 | + more about it: https://www.irs.gov/statistics/soi-tax-stats-about-soi |
| 23 | + |
| 24 | +```{r} |
| 25 | +irs_new_old <- read_csv(file = "New_version_data/eo_md.csv") # earlier eo_md download |
| 26 | +irs_new <- read_csv(file = "New_version_data/eo_md_5_25.csv") # later eo_md download (5_25 file) |
| 27 | +``` |
| 28 | + |
| 29 | + |
| 30 | +# new epostcard 990-N |
| 31 | + |
| 32 | +Gives us information about who was an active org and helps save some from removal for revocation otherwise not listed. |
| 33 | +link: https://www.irs.gov/charities-non-profits/tax-exempt-organization-search-bulk-data-downloads |
| 34 | +direct link to data: (last data posting Oct 28, 2024) https://apps.irs.gov/pub/epostcard/data-download-epostcard.zip |
| 35 | +dictionary: https://www.irs.gov/pub/irs-tege/990n-data-dictionary.pdf |
| 36 | + |
| 37 | +EIN, Tax Year , Organization Name, Gross receipts not greater than, Organization has terminated, Tax Period Begin Date , Tax Period End Date , Website URL , Principal Officer’s Name , Principal Officer’s Address Line 1, Principal Officer’s Address Line 2, Principal Officer’s Address City , Principal Officer’s Address Province, Principal Officer’s Address State , Principal Officer’s Address Zip Code , Principal Officer’s Address Country ,Organization Mailing Address Line 1 , Organization Mailing Address Line 2 , Organization Mailing Address City , Organization Mailing Address Province , Organization Mailing Address State , Organization Mailing Address Zip Code, Organization Mailing Address Country ,Organization Doing Business as Name 1, Organization Doing Business as Name 2 , Organization Doing Business as Name 3 , |
| 38 | +```{r} |
| 39 | +# Read the pipe-delimited e-Postcard file with readr (no header row), forcing column X26 to character |
| 40 | + epost_old <- read_delim(file = "New_version_data/data-download-epostcard.txt", |
| 41 | + delim = "|", col_names = FALSE, col_types = cols(X26 = col_character()), |
| 42 | + trim_ws = TRUE, escape_double = FALSE) |
| 43 | +
|
| 44 | +# Read with base read.table so rows with differing field counts load: col.names fixes the width at 28 columns and fill = TRUE pads short rows. quote = "" and comment.char = "" keep apostrophes, double quotes and "#" inside names/addresses as literal text instead of letting them merge or truncate rows. |
| 45 | +epost <- read.table("New_version_data/data-download-epostcard.txt", col.names = paste0("V", seq_len(28)), fill = TRUE, header = FALSE, sep = "|", quote = "", comment.char = "") |
| 46 | +
|
| 47 | +epost_new <- read.table("New_version_data/data-download-epostcard_5_8_25.txt", col.names = paste0("V", seq_len(28)), fill = TRUE, header = FALSE, sep = "|", quote = "", comment.char = "") # 5_8_25 download; quote = "" / comment.char = "" stop apostrophes, double quotes and "#" in the text fields from merging or truncating rows |
| 48 | +``` |
| 49 | + |
| 50 | +### check for problems |
| 51 | +```{r, include = FALSE} |
| 52 | +prbs <- problems(epost_old) # parsing problems are only recorded on objects read by readr; `epost` comes from base read.table, so checking it always came back empty |
| 53 | +dim(prbs) # first number = how many parsing problems readr recorded |
| 54 | +``` |
| 55 | + |
| 56 | +<!-- ```{bash, include = FALSE} --> |
| 57 | +<!-- # checking the original version of the file for one of the problematic rows --> |
| 58 | +<!-- awk 'FNR>=9294 && FNR<=9294' New_version_data/data-download-epostcard_2024.txt # first problematic row is actually originally on line 9294 --> |
| 59 | + |
| 60 | +<!-- awk 'FNR>=358951 && FNR<=358951' New_version_data/data-download-epostcard_2024.txt #second problematic row --> |
| 61 | + |
| 62 | +<!-- awk 'FNR>=1 && FNR<=3' New_version_data/data-download-epostcard_2024.txt # checking line 1-3 to ensure that there were some empty lines for the first two lines - yup looks like it! --> |
| 63 | +<!-- ``` --> |
| 64 | + |
| 65 | +### Set column names: |
| 66 | +```{r} |
| 67 | +colnames(epost) <- c("EIN", "Tax Year" , "Organization Name", "Gross receipts not greater than", "Organization has terminated", "Tax Period Begin Date" , "Tax Period End Date" , "Website URL" , "Principal Officer’s Name" , "Principal Officer’s Address Line 1", "Principal Officer’s Address Line 2", "Principal Officer’s Address City" , "Principal Officer’s Address Province", "Principal Officer’s Address State" , "Principal Officer’s Address Zip Code" , "Principal Officer’s Address Country" ,"Organization Mailing Address Line 1" , "Organization Mailing Address Line 2" , "Organization Mailing Address City" , "Organization Mailing Address Province" , "Organization Mailing Address State" , "Organization Mailing Address Zip Code", "Organization Mailing Address Country" ,"Organization Doing Business as Name 1", "Organization Doing Business as Name 2", "Organization Doing Business as Name 3") # 26 names for the 28 columns read above: the two overflow columns get no name here and are referred to as na / na_2 after clean_names() below |
| 68 | +
|
| 69 | +
|
| 70 | +epost <- epost %>% clean_names() # janitor: rewrite the column names without spaces |
| 71 | +
|
| 72 | +# Inspect rows whose data spilled into the two overflow columns (na, na_2), limited to MD mailing addresses - only one for MD, and it stays in the dataset |
| 73 | +epost_extra_long <- filter(epost, na != "", organization_mailing_address_state == "MD") |
| 74 | +epost_extra_long2 <- filter(epost, na_2 != "", organization_mailing_address_state == "MD") |
| 75 | +
|
| 76 | +``` |
| 77 | + |
| 78 | + |
| 79 | +### How many in epost not in irs_new? filter for md |
| 80 | + |
| 81 | +```{r} |
| 82 | +irs_new <- rename_with(irs_new, tolower) # lowercase the column names so the join key is `ein` in both tables |
| 83 | +epost <- epost %>% filter(organization_mailing_address_state == "MD") # keep MD organizations: filter on the organization's own mailing state (the same column the overflow-row check above uses), not the principal officer's state, so the comparison with the MD IRS file is like-for-like |
| 84 | +
|
| 85 | +epost_only <- epost %>% anti_join(irs_new, by = "ein") # e-Postcard rows whose EIN has no match in irs_new |
| 86 | +nrow(epost_only) # how many e-Postcard rows are not in irs_new |
| 87 | +nrow(irs_new) # rows in the IRS extract |
| 88 | +nrow(epost) # rows in the filtered e-Postcard table |
| 89 | +``` |
| 90 | + |
| 91 | + |
| 92 | +## Combine IRS data |
| 93 | + |
| 94 | +```{r} |
| 95 | +
|
| 96 | +irs_epost <- epost %>% full_join(irs_new, by = "ein") # every row from either table, matched on EIN where possible |
| 97 | +
|
| 98 | +IRS <- irs_new %>% left_join(epost, by = "ein") # every irs_new row is kept; matching e-Postcard fields are appended as extra columns |
| 99 | +nrow(irs_new) == nrow(IRS) # TRUE when the join did not change the number of rows |
| 100 | +``` |
| 101 | + |
| 102 | + |
| 103 | + |
| 104 | + |
| 105 | + |
| 106 | +## Revocations |
| 107 | + |
| 108 | +https://www.irs.gov/charities-non-profits/tax-exempt-organization-search-bulk-data-downloads --> click on automatic revocation or exemption list link |
| 109 | + |
| 110 | +Direct link downloaded 5_8_25 (this dataset will change over time): https://apps.irs.gov/pub/epostcard/data-download-revocation.zip |
| 111 | + |
| 112 | +last updated Dec,9, 2024 |
| 113 | + |
| 114 | +Under Internal Revenue Code Section 6033(j)(1)(A), the IRS will revoke the tax-exempt status of nonprofit organizations automatically when organizations with a filing requirement do not file their required Forms 990 for three consecutive years. When such a revocation occurs, it is effective as of the filing deadline, which is typically May 15 for organizations filing on a calendar year basis. This year, however, the Form 990 filing deadline was delayed to July 15 due to the COVID-19 pandemic |
| 115 | + |
| 116 | +Organizations that do not file a required annual information return or notice for three consecutive years automatically lose their tax-exempt status by operation of law. An automatic revocation is effective on the original filing due date of the third annual return or notice (the "revocation date"). Due to the COVID-19 emergency, this year the IRS extended the filing dates for these returns and notices due from April 1 through July 14 to July 15, 2020. Organizations eligible for this relief that failed to file for the two previous years and did not file by July 15 have automatically lost their tax-exempt status. Due to systemic limitations, these organizations appear on the auto-revocation list showing a revocation date between April 1 and July 14, 2020. However, the revocation date for these organizations is July 15, 2020. For more information on automatic revocation, including how to request reinstatement, see Automatic revocation - How to have your tax-exempt status reinstated. |
| 117 | + |
| 118 | +more info: https://www.irs.gov/pub/irs-pdf/p4991.pdf |
| 119 | + |
| 120 | + |
| 121 | + |
| 122 | +```{r} |
| 123 | +revocations <- read_delim(file = "New_version_data/data-download-revocation_3.txt", |
| 124 | + delim = "|", col_names = FALSE, trim_ws = TRUE, |
| 125 | + escape_double = FALSE) # pipe-delimited file read without a header row |
| 126 | +revocations_new <- read_delim(file = "New_version_data/data-download-revocation_5_8_25.txt", |
| 127 | + delim = "|", col_names = FALSE, trim_ws = TRUE, |
| 128 | + escape_double = FALSE) # 5_8_25 download, read the same way |
| 129 | +revocations %>% head() # peek at the first rows before naming the columns |
| 130 | +names(revocations) <- c("ein", "rev_org_name", "address1", "address2", "city", "state", "zip", "country", "some_rev_code", "rev_date1", "rev_date2", "not_sure") |
| 131 | +IRS_with_rev <- IRS %>% left_join(revocations, by = "ein", suffix = c("irs", "rev")) # columns present in both tables get "irs"/"rev" appended directly (no separator) |
| 132 | +``` |
| 133 | + |
| 134 | +# Prepare for Geocoding |
| 135 | + |
| 136 | +```{r} |
| 137 | +library(ggmap) |
| 138 | +IRS <- unite(IRS_with_rev, col = "address", street:zipirs, sep = ", ", remove = FALSE) # build one comma-separated address string from street through zipirs, keeping the source columns (original note: the step after this one is slow) |
| 139 | +saveRDS(object = IRS, file = "New_version_data/IRS_data_before_lat_long.rds") |
| 140 | +``` |
| 141 | + |
| 142 | + |
0 commit comments