- 1 Load libraries
- 2 Municipalities data
- 3 General elections data
- 4 Budget data
- 4.1 Importing and manipulating Sela budget data from Open Budget for 2016-2020
- 4.2 Importing and manipulating Sela budget data from the Ministry of Culture for 2016-2019
- 4.3 Getting conversion table between tax municipal id and CBS municipal id
- 4.4 Adding municipality id to a data frame with budget data
- 5 SELA adiitional data
- 6 Combining all data sources into one data frame
- 7 Future Code that is not operatable right now
library(tidyverse)
library(readxl)
library(httr)
library(rvest)
locale("he")
This is a function that gets a url and returns a tibble. First, it extracts the file extension with a regular expression, and then downloads the file with the url parameter. later, it reads the two lines of names of variables and handles each one of them separately. The upper row gets filled with previous variable names for NAs because of merged cells in the original table. The lower row gets blank string for NAs. When concatenating, if there is a second argument for the variable, the variable name gets padded with blank space between its two arguments.
read_muni_new <- function(url){
file_ext <- str_extract(url, "[0-9a-z]+$")
GET(url, write_disk(tf <- tempfile(fileext = file_ext)))
df_head <- read_excel(tf, sheet = 2, skip = 3, n_max = 2, col_names = FALSE)
col_names_1 <- df_head %>%
slice(1) %>%
pivot_longer(everything()) %>%
select(value) %>%
fill(value)
col_names_2 <- df_head %>%
slice(2) %>%
pivot_longer(everything()) %>%
select(value) %>%
replace_na(list(value = ""))
df_vars <- bind_cols(col_names_1, col_names_2) %>%
mutate(
var_names = if_else(str_length(value...2) > 0,
str_c(value...1, " ", value...2),
str_c(value...1, value...2))
)
var_names <- df_vars %>% pull(var_names)
df_whole <- read_excel(tf, sheet = 2, skip = 5, col_names = var_names)
return(df_whole)
}
This function is important because some SELA data is using population data older than 2018. This function takes the url of the file, the wanted column numbers for the cities and for the regional councils. It returns a tibble with a municipality id and the wanted variable values.
read_muni_old <- function(url, col_num_1, col_num_2){
file_ext <- str_extract(url, "[0-9a-z]+$")
GET(url, write_disk(tf <- tempfile(fileext = file_ext)))
df1 <- read_excel(tf, sheet = 2, skip = 1)
df2 <- read_excel(tf, sheet = 4, skip = 1)
df1 <- df1 %>%
select(
muni_id = 2,
var = col_num_1
) %>%
filter(str_length(muni_id) == 4)
df2 <- df2 %>%
select(
muni_id = 2,
var = col_num_2
) %>%
filter(str_length(muni_id) == 2)
df <- bind_rows(df1, df2)
df
}
The function gets the 2021 yishuvim file from CBS, cleans it, and returns a tibble of id’s of yishuvim by id’s of municipality. the values are all text; regional councils have 2-numbers text id’s, and yishuvim and other municipalities have 4-numbers text id’s. If there is no municipality id, it means that the yishuv is either unrecognized (for example, some Bedouin people in the Negev) or it is some sort of place that is not under any municipality (for example, Mikveh Israel farm).
get_yishuv_muni <- function(){
url <- "https://www.cbs.gov.il/he/publications/doclib/2019/ishuvim/bycode2021.xlsx"
file_ext <- str_extract(url, "[0-9a-z]+$")
GET(url, write_disk(tf <- tempfile(fileext = file_ext)))
yishuvim <- read_excel(tf, col_types = "text")
yishuvim <- yishuvim %>%
select(
yishuv_id = 2,
nafa_id = 6,
muni_id = 9
) %>%
mutate(
yishuv_id = str_pad(yishuv_id, width = 4, side = "left", pad = "0"),
muni_id = case_when(
(muni_id == "0" | muni_id == "99") ~ yishuv_id,
TRUE ~ str_pad(muni_id, width = 2, side = "left", pad = "0")
)
)
yishuvim
}
important to note this function is currently only applicable to excel files
read_elec_general <- function(url){
file_ext <- str_extract(url, "[0-9a-z]+$")
GET(url, write_disk(tf <- tempfile(fileext = file_ext)))
df <- read_excel(tf)
df
}
The function receives a data frame and the column number of the yishuv id as arguments. First, it calls the data frame that links between yishuvim and municipalities. Then, it converts the yishuv_id from the argument to a character vector, pads it with 0’s to fit the format, and adds the municipalities id’s by the yishuv id. The function returns the original data frame with two additional columns: the formatted yishuv id and the added municipality id.
match_yishuv_muni <- function(data, id_col_num){
df_keys <- get_yishuv_muni() %>%
select(-nafa_id)
data %>%
mutate(
yishuv_id = as.character(pull(., id_col_num)),
yishuv_id = str_pad(yishuv_id, width = 4, side = "left", pad = "0")
) %>%
left_join(df_keys, by = ("yishuv_id"))
}
The function receives a data frame of the elections with municipal id for every voting site. It renames the relevant total and party-specific variables, groups by every municipality, and calculates summary statistics: voting percentage for HaLikud party as a part of total votes, voting percentage for coalition parties (Israel Betetny was included even though they left the coalition 1 year prior to elections), total potential votes and total good votes. the function returns the summarized data frame. NA’s are yishuvim not under any municipality, the only municipality with no votes is Ein Kinya (muni_id 4502).
get_elect_pct <- function(data){
data %>%
rename(
pot_votes = 4,
good_votes = 7,
yahadut_hatorah = 9,
habait_hayehudi = 14,
kulanu = 19,
israel_beytenu = 20,
halikud = 21,
shas = 33
) %>%
group_by(muni_id) %>%
summarize(
likud_pct = 100 * sum(halikud) / sum(good_votes),
coal_pct = 100 * sum(
yahadut_hatorah +
habait_hayehudi +
kulanu +
israel_beytenu +
halikud +
shas
) / sum(good_votes),
pot_votes = sum(pot_votes),
good_votes = sum(good_votes)
)
}
The function reads the csv file from The Open Budget website. It then selects and renames relevant variables, replaces NA’s with 0’s, summarizes by year and municipality, and filters for the relevant year from the parameter. It returns the summarized data frame. Important to note the current url method is pretty brutal and further programming is needed here, maybe working with their API.
get_sela_data_open <- function(year_param){
url <- "https://next.obudget.org/api/download?query=SELECT%20year_requested%20AS%20%22%D7%A9%D7%A0%D7%94%22%2C%20supporting_ministry%20AS%20%22%D7%9E%D7%A9%D7%A8%D7%93%22%2C%20request_type%20AS%20%22%D7%A1%D7%95%D7%92%20%D7%AA%D7%9E%D7%99%D7%9B%D7%94%22%2C%20support_title%20AS%20%22%D7%A0%D7%95%D7%A9%D7%90%22%2C%20budget_code%2C%20budget_code%20AS%20%22%D7%9E%D7%A1%D7%A4%D7%A8%20%D7%AA%D7%A7%D7%A0%D7%94%22%2C%20%27supports%2F%27%20%7C%7C%20budget_code%20%7C%7C%20%27%2F%27%20%7C%7C%20year_requested%20%7C%7C%20%27%2F%27%20%7C%7C%20entity_id%20%7C%7C%20%27%2F%27%20%7C%7C%20request_type%20AS%20item_id%2C%20coalesce(entity_name%2C%20recipient)%20as%20%22%D7%9E%D7%A7%D7%91%D7%9C%20%D7%94%D7%AA%D7%9E%D7%99%D7%9B%D7%94%22%2C%20entity_id%20as%20%22%D7%9E%D7%A1%D7%A4%D7%A8%20%D7%AA%D7%90%D7%92%D7%99%D7%93%22%2C%20%27org%2F%27%20%7C%7C%20entity_kind%20%7C%7C%20%27%2F%27%20%7C%7C%20entity_id%20as%20entity_item_id%2C%20sum(amount_approved)%20as%20%22%D7%A1%D7%94%D7%B4%D7%9B%20%D7%90%D7%95%D7%A9%D7%A8%22%2C%20sum(amount_paid)%20as%20%22%D7%A1%D7%94%D7%B4%D7%9B%20%D7%A9%D7%95%D7%9C%D7%9D%22%20FROM%20raw_supports%20WHERE%20year_requested%20%3E0%20AND%20budget_code%20%3D%20%270019420256%27%20GROUP%20BY%201%2C%202%2C%203%2C%204%2C%205%2C%206%2C%207%2C%208%2C%209%2C%2010%20order%20by%20year_requested%20desc&format=csv&filename=%D7%A1%D7%9C%20%D7%9C%D7%AA%D7%A8%D7%91%D7%95%D7%AA%20%D7%A2%D7%99%D7%A8%D7%95%D7%A0%D7%99%D7%AA__%20%D7%A4%D7%99%D7%A8%D7%95%D7%98%20%D7%9B%D7%9C%20%D7%94%D7%AA%D7%9E%D7%99%D7%9B%D7%95%D7%AA%20%D7%9E%D7%AA%D7%A7%D7%A6%D7%99%D7%91%20%D7%96%D7%94%20%D7%A9%D7%90%D7%95%D7%A9%D7%A8%D7%95%20%D7%91%20%D7%9B%D7%9C%20%D7%94%D7%A9%D7%A0%D7%99%D7%9D&headers=%D7%A0%D7%95%D7%A9%D7%90%3Aitem_link(item_id)%3B%D7%9E%D7%A1%D7%A4%D7%A8%20%D7%AA%D7%A7%D7%A0%D7%94%3Abudget_code%3Asearch_term(budget_code)%3B%D7%9E%D7%A9%D7%A8%D7%93%3B%D7%A1%D7%95%D7%92%20%D7%AA%D7%9E%D7%99%D7%9B%D7%94%3B%D7%9E%D7%A7%D7%91%D7%9C%20%D7%94%D7%AA%D7%9E%D7%99%D7%9B%D7%94%3Aitem_link(entity_item_id)%3B%D7%9E%D7%A1%D7%A4%D7%A8%20%D7%AA%D7%90%D7%92%D7%99%D7%93%3B%D7%A9%D7%A0%D7%94%3B%D7%A1%D7%94%D7%B4%D7%9B%20%D7%90%D7%95%D7%A9%D7%A8%3Anumber%3B%D7%A1%D7%94%D7%B4%D7%9B%20%D7%A9%D7%95%D7%9C%D7%9D%3Anumber"
read_csv(url) %>%
select(
tax_name = 5,
tax_id = 6,
year = 7,
budget_approved = 8,
budget_paid = 9
) %>%
replace_na(
list(budget_approved = 0, budget_paid = 0)
) %>%
group_by(tax_id, year) %>%
summarise(
budget_approved = sum(budget_approved),
budget_paid = sum(budget_paid)
) %>%
ungroup() %>%
filter(year == year_param)
}
check2 <- get_sela_data_open(2018)
This function reads the XLSX file from the ministry of culture official website1, reads the specific SELA sheet, removes the first totals row, selects and renames the relevant columns, pivots the data to a longer format by year and budget type (initiatives, festivals or total), replaces NA’s with 0’s, filters for the needed year by parameter, and pivots back to the wider format to distinguish easily between different budget types.
get_sela_data_culture <- function(year_param){
url <- "https://www.gov.il/BlobFolder/generalpage/ministry_support/he/%D7%AA%D7%9E%D7%99%D7%9B%D7%95%D7%AA%20%D7%94%D7%9E%D7%A9%D7%A8%D7%93%20%D7%9C%D7%92%D7%95%D7%A4%D7%99%20%D7%AA%D7%A8%D7%91%D7%95%D7%AA%202016-2019.xlsx"
file_ext <- str_extract(url, "[0-9a-z]+$")
GET(url, write_disk(tf <- tempfile(fileext = file_ext)))
df <- read_excel(tf, sheet = 39)
df %>%
slice_tail(n = -1) %>%
select(
tax_id = 1,
tax_name = 2,
init_2016 = 3,
fest_2016 = 4,
tot_2016 = 5,
init_2017 = 7,
fest_2017 = 8,
tot_2017 = 9,
init_2018 = 11,
fest_2018 = 12,
tot_2018 = 13,
init_2019 = 15,
fest_2019 = 16,
tot_2019 = 17
) %>%
pivot_longer(-c(tax_id, tax_name), names_to = c("budget_type", "year"), names_sep = "_", values_to = "budget") %>%
replace_na(list(budget = 0)) %>%
mutate(
tax_id = as.numeric(tax_id),
year = as.numeric(year)
) %>%
filter(year == year_param) %>%
pivot_wider(c(tax_id, year), names_from = "budget_type", names_prefix = "budget_approved_", values_from = "budget")
}
check <- get_sela_data_culture(2018)
This function reads the csv file from my GitHub repository, selects the two relevant variables, and returns the data frame.
get_muni_id_conv <- function(){
read_csv("https://raw.githubusercontent.com/matanhakim/general_files/main/muni_ids.csv") %>%
select(
cbs_id,
tax_id
)
}
match_budget_muni <- function(data){
df_keys <- get_muni_id_conv()
data %>%
left_join(df_keys, by = "tax_id") %>%
select(
-tax_id,
-year
)
}
This function reads the 2013 CBS SES data for municipalities that is being used by 2018 SELA regulations to determine eligibility of municipalities. It reads the file, selects the relevant variables, removes excess rows, and transforms the id to the usual format.
read_ses_2013 <- function(url){
file_ext <- str_extract(url, "[0-9a-z]+$")
GET(url, write_disk(tf <- tempfile(fileext = file_ext)))
df <- read_excel(tf, skip = 6)
df %>%
slice(2:256) %>%
select(
muni_status = 1,
muni_id = 2,
heb_name = 3,
ses_2013_i = 5,
ses_2013_r = 6,
ses_2013_c = 7
) %>%
mutate(
muni_id = as.character(muni_id),
muni_id = case_when(
(muni_status == "0" | muni_status == "99") ~ str_pad(muni_id, width = 4, side = "left", pad = "0"),
TRUE ~ str_pad(muni_status, width = 2, side = "left", pad = "0")
)
) %>%
select(-c(muni_status, heb_name))
}
Important to note that this is and old indicator, therefore since then some municipal jurisdiction changes have happened:
read_peri_2004 <- function(url){
file_ext <- str_extract(url, "[0-9a-z]+$")
GET(url, write_disk(tf <- tempfile(fileext = file_ext)))
df <- read_excel(tf, skip = 7)
df %>%
select(
muni_id = 1,
heb_name = 2,
peri_2004_i = 9,
peri_2004_r = 10,
peri_2004_c = 11
) %>%
mutate(
muni_id = as.character(muni_id),
muni_id = case_when(
str_length(muni_id) == 5 ~ str_sub(muni_id, start = -2),
TRUE ~ str_pad(muni_id, width = 4, side = "left", pad = "0")
)
) %>%
select(-heb_name)
}
read_ses_sa_2008 <- function(url){
file_ext <- str_extract(url, "[0-9a-z]+$")
GET(url, write_disk(tf <- tempfile(fileext = file_ext)))
df <- read_excel(tf, skip = 5)
df %>%
slice_head(n = -3) %>%
select(
muni_id = 2,
sa_id = 4,
ses_sa_2008_i = 7,
ses_sa_2008_r = 8,
ses_sa_2008_c = 9
) %>%
mutate(
muni_id = str_pad(as.character(muni_id), width = 4, side = "left", pad = "0")
) %>%
nest(-muni_id) %>%
rename(sa_data = data)
}
Since the SELA budget relies also on national priority areas, these data are needed to be imported. ### Getting the list of tables from the national priority webpage This function reads the table data in the national priority areas government decision webpage, and returns a list of those tables.
get_nat_pri_list <- function(){
nat_pri_url <- "https://www.gov.il/he/departments/policies/2013_des667"
read_html(nat_pri_url) %>%
html_elements("table") %>%
html_table()
}
This function reads the tables from the previous section and manipulates them: - The nafot (subdistricts) data is added with the corresponding nafa_id column, and converts Hebrew data to logical. - The yishuvim declared as national pririty are cleaned, added with a TRUE column and formats the yishuv_id. - The yishuvim declared as close to the border or threatened are cleaned, added with a TRUE column and formats the yishuv_id. - The whole yishuvim list is being caled from the CBS website, and then all other three data frames are joined. NA’s are replaced with FALSE, and a final national priority variable for each yishuv is calculated. - Finally, yishuvim with NA as municipality are filtered out, and a final national priority variable for each municipality is calculated as having either more than 75% of yishuvim in the municipality as national priority, or more than 50% of yishuvim in the municipality as close to the border or threatened.
get_nat_pri_munis <- function(){
pri_list <- get_nat_pri_list()
pri_nafot <- pri_list[[1]] %>%
add_column(nafa_id = c(29,21,24,62,22,23,71,32,11,61,31,41,44,43,42,51)) %>%
mutate(
nafa_id = as.character(nafa_id),
nafa_nat_pri = (X7 == "כן")
) %>%
select(nafa_id, nafa_nat_pri)
pri_yishuvim <- pri_list[[2]] %>%
select(yishuv_id = 1) %>%
slice_tail(n = -1) %>%
add_column(yishuv_nat_pri = TRUE) %>%
mutate(yishuv_id = str_pad(yishuv_id, width = 4, side = "left", pad = "0"))
pri_border <- pri_list[[3]] %>%
select(yishuv_id = 1) %>%
slice_tail(n = -1) %>%
add_column(border_nat_pri = TRUE) %>%
mutate(yishuv_id = str_pad(yishuv_id, width = 4, side = "left", pad = "0"))
pri_df <- get_yishuv_muni() %>%
left_join(pri_nafot, by = "nafa_id") %>%
left_join(pri_yishuvim, by = "yishuv_id") %>%
left_join(pri_border, by = "yishuv_id") %>%
mutate(
across(ends_with("pri"), ~ replace_na(., FALSE)),
is_nat_pri = nafa_nat_pri | yishuv_nat_pri | border_nat_pri
)
pri_df %>%
filter(!is.na(muni_id)) %>%
group_by(muni_id) %>%
summarise(is_nat_pri = (mean(is_nat_pri) > 0.75) | (mean(border_nat_pri) >= 0.5))
}
muni_2018_url <- "https://www.cbs.gov.il/he/publications/doclib/2019/hamakomiot1999_2017/2018.xlsx" # Initializing the url for 2018 municipalities data
elec_url <- "https://bechirot22.bechirot.gov.il/election/Documents/%D7%91%D7%97%D7%99%D7%A8%D7%95%D7%AA%20%D7%A7%D7%95%D7%93%D7%9E%D7%95%D7%AA/results_20.xls" # Initializing the url for 2015 elections
ses_2013_url <- "https://www.cbs.gov.il/he/publications/doclib/2017/socio_eco13_1694/t02.xls" # Initializing the url for 2013 CBS SES data
peri_2004_url <- "https://www.cbs.gov.il/he/mediarelease/doclib/2008/160/24_08_160t2.xls" # Initializing the url for 2004 CBS periphery data
ses_sa_2008_url <- "https://www.cbs.gov.il/he/mediarelease/doclib/2013/087/24_13_087t6.xls" # Initializing the url for 2008 CBS atatistical areas data
pop_2015_url <- "https://www.cbs.gov.il/he/publications/doclib/2019/hamakomiot1999_2017/2015.xls" # Initializing the url for 2015 CBS population data
muni_df <- read_muni_new(muni_2018_url) %>%
rename(muni_id = 2)
elec_df <- read_elec_general(elec_url) %>% # Reading the general elections raw data
match_yishuv_muni(2) %>% # Adding the linked yishuvim and municipalities to elections data
get_elect_pct() # Calculating elections data by municipality
budget_open_df <- get_sela_data_open(2018) %>% # Reading and manipulating Sela open budget data by year and municipality
match_budget_muni() # Adding the cbs municipality id
budget_culture_df <- get_sela_data_culture(2018) %>% # Reading and manipulating Sela ministry of culture budget data by year and municipality
match_budget_muni() # Adding the cbs municipality id
ses_2013_df <- read_ses_2013(ses_2013_url) # Reading 2013 CBS SES data
peri_2004_df <- read_peri_2004(peri_2004_url) # Reading 2004 CBS periphery data
ses_sa_2008_df <- read_ses_sa_2008(ses_sa_2008_url) # Reading 2008 CBS statistical areas data
nat_pri_df <- get_nat_pri_munis() # Reading 2013 national priority areas government decision for municipalities
pop_2015_df <- read_muni_old(pop_2015_url, 14, 31) %>%
rename(pop_2015 = var)# Reading the 2015 CBS population data
raw_df <- muni_df %>%
left_join(elec_df, by = "muni_id") %>%
left_join(budget_open_df, by = c("muni_id" = "cbs_id")) %>%
left_join(budget_culture_df, by = c("muni_id" = "cbs_id")) %>%
mutate( # Change NA's (unbudgeted or no voters municipalities) to 0's
across(
last_col(5:0),
replace_na, 0
)
) %>%
left_join(ses_2013_df, by = "muni_id") %>%
left_join(peri_2004_df, by = "muni_id") %>%
left_join(ses_sa_2008_df, by = "muni_id") %>%
left_join(nat_pri_df, by = "muni_id") %>%
left_join(pop_2015_df, by = "muni_id")
Error in `mutate()`:
! Problem while computing `..1 = across(last_col(5:0), replace_na, 0)`.
Caused by error in `across()`:
! Problem while evaluating `last_col(5:0)`.
Caused by error in `last_col()`:
! `offset` must be a single integer, not an integer vector.
Right now there is an error
bind_muni_2016_2019 <- function(){
url_2016 <- "https://www.cbs.gov.il/he/publications/doclib/2019/hamakomiot1999_2017/2016.xlsx"
url_2017 <- "https://www.cbs.gov.il/he/mediarelease/doclib/2019/057/24_19_057t1.xlsx"
url_2018 <- "https://www.cbs.gov.il/he/publications/doclib/2019/hamakomiot1999_2017/2018.xlsx"
url_2019 <- "https://www.cbs.gov.il/he/publications/doclib/2019/hamakomiot1999_2017/2019.xlsx"
df_2016 <- read_muni_new(url_2016)
df_2017 <- read_muni_new(url_2017)
df_2018 <- read_muni_new(url_2018)
df_2019 <- read_muni_new(url_2019)
muni_df <- bind_rows(
df_2016,
df_2017#,
# df_2018,
# df_2019
)
muni_df
}
I read the municipalities file by url, extracts the two tables of local municipalities and regional councils, and merges them. Right now there is an error because of the merging process.
read_muni_old <- function(url){
file_ext <- str_extract(url, "[0-9a-z]+$")
GET(url, write_disk(tf <- tempfile(fileext = file_ext)))
df2 <- read_excel(tf, sheet = 2, skip = 1)
df2 <- df2 %>% drop_na(3)
df3 <- read_excel(tf, sheet = 4, skip = 1)
df3 <- df3 %>% drop_na(3)
df <- bind_rows(df2, df3)
return(df)
}
# df <- read_muni_old("https://www.cbs.gov.il/he/publications/doclib/2019/hamakomiot1999_2017/2015.xls")