Technique 5: 5_itertuples_in_for_loop
# Pre-allocate a `val` list with one slot per dataframe row. `np.nan` is the
# canonical spelling and works on every NumPy version (the uppercase aliases
# are legacy names that the NumPy 2 namespace no longer provides).
val = [np.nan] * len(df)

# Now iterate over all rows in the dataframe, and populate `val`.
#
# The slot is chosen by row *position* (via `enumerate`) rather than by the
# `row.Index` label, so this also works when the dataframe's index is not the
# default 0..N-1 range (e.g. after filtering, sorting, or `set_index()`).
for i, row in enumerate(df.itertuples()):
    val[i] = calculate_val(
        row.A_i_minus_2,
        row.A_i_minus_1,
        row.A,
        row.A_i_plus_1,
        row.B,
        row.C,
        row.D,
    )

# Put this column back into the dataframe. Assigning a plain list is
# positional, which matches the positional fill above.
df["val"] = val
Technique 6: 6_vectorization__with_apply_for_if_statement_corner_case
def calculate_new_column_b_value(b_value):
    """Return the scaled `B` term for a single (scalar) `B` value.

    Args:
        b_value: One value taken from the `B` column.

    Returns:
        `6 * b_value` when `b_value > 0`, otherwise `60 * b_value`.
    """
    # Python ternary operator
    b_value_new = (6 * b_value) if b_value > 0 else (60 * b_value)
    return b_value_new
# The `B` term hides an embedded `if-else` statement, which makes pure
# vectorization less intuitive for that one column. So handle `B` on its own
# first, using **`apply()`** to run the scalar helper on every element.
df["B_new"] = df["B"].apply(calculate_new_column_b_value)
# OR (same thing, but with a lambda function instead of the named helper):
#
#     df["B_new"] = df["B"].apply(lambda x: (6 * x) if x > 0 else (60 * x))

# Everything else is plain vectorization. "Vectorization" here simply means
# writing the equation directly in terms of whole column Series, with no
# manual iteration: pandas performs the element-wise work for you, so you
# only have to write the math.
df["val"] = (
    2 * df["A_i_minus_2"]
    + 3 * df["A_i_minus_1"]
    + 4 * df["A"]
    + 5 * df["A_i_plus_1"]
    + df["B_new"]
    + 7 * df["C"]
    - 8 * df["D"]
)
Technique 7: 7_vectorization__with_list_comprehension_for_if_statment_corner_case
# The `B` term hides an embedded `if-else` statement, which makes pure
# vectorization less intuitive for that one column. So build the new `B`
# column first with a **list comprehension** that calls the scalar helper on
# each element, and vectorize the rest.
df["B_new"] = [calculate_new_column_b_value(b) for b in df["B"]]

# Everything else is plain vectorization. "Vectorization" here simply means
# writing the equation directly in terms of whole column Series, with no
# manual iteration: pandas performs the element-wise work for you, so you
# only have to write the math.
df["val"] = (
    2 * df["A_i_minus_2"]
    + 3 * df["A_i_minus_1"]
    + 4 * df["A"]
    + 5 * df["A_i_plus_1"]
    + df["B_new"]
    + 7 * df["C"]
    - 8 * df["D"]
)
Technique 8: 8_pure_vectorization__with_df.loc[]_boolean_array_indexing_for_if_statment_corner_case
This uses boolean indexing, AKA: a boolean mask, to accomplish the equivalent of the if statement in the equation. In this way, pure vectorization can be used for the entire equation, thereby maximizing performance and speed.
# If statement to evaluate:
#
#     if B > 0:
#         B_new = 6 * B
#     else:
#         B_new = 60 * B
#
# Because the `B` column hides an embedded `if-else` statement, we use
# boolean array indexing through `df.loc[]` so that the whole equation can
# stay purely vectorized.
#
# Explanation:
#
# Long:
#
# The format is: `df.loc[rows, columns]`. In this case the rows are specified
# by a "boolean array" (AKA: a boolean expression, list of booleans, or
# "boolean mask") that selects all rows where `B` is > 0. Only the `B` column
# of those rows is read, and the value is set accordingly. Once that is done
# for the rows where `B` is > 0, the same thing is done for the rows where
# `B` is <= 0, except with the other equation.
#
# Short:
#
# For all rows where the boolean expression applies, set the column value
# accordingly.
#
# GitHub CoPilot first showed me this `.loc[]` technique.
# See also the official documentation:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html

# One boolean mask per branch of the if statement.
b_is_positive = df["B"] > 0
b_is_not_positive = df["B"] <= 0

# ===========================
# 1st: handle the > 0 case
# ===========================
# This creates the "B_new" column. Only the rows selected by the mask get a
# value here; the remaining rows are filled in by the 2nd step below.
df["B_new"] = df.loc[b_is_positive, "B"] * 6

# ===========================
# 2nd: handle the <= 0 case, merging the results into the
# previously-created "B_new" column
# ===========================
# - NB: this does NOT work; it overwrites and replaces the whole "B_new"
#   column instead:
#
#       df["B_new"] = df.loc[df["B"] <= 0, "B"] * 60
#
# This works, because it assigns only into the masked rows of "B_new":
df.loc[b_is_not_positive, "B_new"] = df.loc[b_is_not_positive, "B"] * 60

# Now use normal vectorization for the rest.
df["val"] = (
    2 * df["A_i_minus_2"]
    + 3 * df["A_i_minus_1"]
    + 4 * df["A"]
    + 5 * df["A_i_plus_1"]
    + df["B_new"]
    + 7 * df["C"]
    - 8 * df["D"]
)
Technique 9: 9_apply_function_with_lambda
# Hand every row to `calculate_val()` through a lambda.
#
# `axis="columns"` is the same as `axis=1`: "apply the function to each row",
# rather than to each column.
df["val"] = df.apply(
    lambda row: calculate_val(
        row["A_i_minus_2"],
        row["A_i_minus_1"],
        row["A"],
        row["A_i_plus_1"],
        row["B"],
        row["C"],
        row["D"],
    ),
    axis="columns",
)
Technique 10: 10_list_comprehension_w_zip_and_direct_variable_assignment_passed_to_func
# Walk the seven columns in lockstep with `zip()`, unpack each row's values
# straight into named variables, and pass them to the function.
#
# Note: you *could* do the calculations directly in the comprehension instead
# of using a function call, so long as you don't have indented code blocks
# such as sub-routines or multi-line if statements.
#
# I'm using a function call.
df["val"] = [
    calculate_val(a_im2, a_im1, a, a_ip1, b, c, d)
    for a_im2, a_im1, a, a_ip1, b, c, d in zip(
        df["A_i_minus_2"],
        df["A_i_minus_1"],
        df["A"],
        df["A_i_plus_1"],
        df["B"],
        df["C"],
        df["D"],
    )
]
Technique 11: 11_list_comprehension_w_zip_and_direct_variable_assignment_calculated_in_place
# Walk the seven columns in lockstep with `zip()`, unpack each row's values
# straight into named variables, and evaluate the equation in place (no
# function call).
df["val"] = [
    (
        2 * a_im2
        + 3 * a_im1
        + 4 * a
        + 5 * a_ip1
        # Python ternary operator; don't forget parentheses around the
        # entire ternary expression!
        + ((6 * b) if b > 0 else (60 * b))
        + 7 * c
        - 8 * d
    )
    for a_im2, a_im1, a, a_ip1, b, c, d in zip(
        df["A_i_minus_2"],
        df["A_i_minus_1"],
        df["A"],
        df["A_i_plus_1"],
        df["B"],
        df["C"],
        df["D"],
    )
]
Technique 12: 12_list_comprehension_w_zip_and_row_tuple_passed_to_func
# Walk the seven columns in lockstep with `zip()`. Each `row` is a plain
# 7-tuple holding that row's values in the same order as the columns listed
# in the `zip()` call, so the function arguments are picked out by position.
df["val"] = [
    calculate_val(row[0], row[1], row[2], row[3], row[4], row[5], row[6])
    for row in zip(
        df["A_i_minus_2"],
        df["A_i_minus_1"],
        df["A"],
        df["A_i_plus_1"],
        df["B"],
        df["C"],
        df["D"],
    )
]
Technique 13: 13_list_comprehension_w__to_numpy__and_direct_variable_assignment_passed_to_func
# Select the seven columns, convert them to a NumPy array, and iterate over
# that array's rows, unpacking each row into named variables.
#
# Note: you *could* do the calculations directly in the comprehension instead
# of using a function call, so long as you don't have indented code blocks
# such as sub-routines or multi-line if statements.
#
# I'm using a function call.
#
# Note: the `[[...]]` double-bracket indexing below is used to select a
# subset of columns from the dataframe. The inner `[]` brackets create a list
# from the column names within them, and the outer `[]` brackets accept this
# list to index into the dataframe and select just this list of columns, in
# that order.
# - See the official documentation on it here:
#   https://pandas.pydata.org/docs/user_guide/indexing.html#basics
#   - Search for the phrase "You can pass a list of columns to [] to
#     select columns in that order."
# - I learned this from this comment here:
#   https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas/55557758#comment136020567_55557758
# - One of the **list comprehension** examples in this answer here
#   uses `.to_numpy()` like this:
#   https://stackoverflow.com/a/55557758/4561887
#
# NB: `.values` works here too, but the pandas documentation recommends
# `.to_numpy()` instead. See:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.values.html
df["val"] = [
    calculate_val(a_im2, a_im1, a, a_ip1, b, c, d)
    for a_im2, a_im1, a, a_ip1, b, c, d in df[
        [
            "A_i_minus_2",
            "A_i_minus_1",
            "A",
            "A_i_plus_1",
            "B",
            "C",
            "D",
        ]
    ].to_numpy()
]