I have a dataframe that contains information about programmers, such as country, programming languages, etc.:
COUNTRY PROGRAMMING_LANGUAGE
usa javascript
uk python;swift;kotlin
india python;ruby
usa c++;c;assembly;python
canada java;php;golang;ruby
angola python;c#
india c;java
brazil javascript;php
canada php;sql
india c#;java
brazil java;javascript
russia java;kotlin
china javascript
usa python;c;c++
india ruby
australia javascrit
india php;java
china swift;kotlin
russia php;sql
brazil firebase;kotlin
uk sql;firebase
canada python;c
portugal python;php
My program should display the following in a dataframe:
- All countries;
- How many people from each country use python;
COUNTRY KNOWS_PYTHON
usa 2
uk 1
india 1
angola 1
canada 1
portugal 1
russia 0
brazil 0
australia 0
china 0
Please share your opinion about my algorithm and suggest any possible ways to improve it:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)


def count_language_users(table, language='python'):
    """Count, per country, how many programmers list *language*.

    Parameters
    ----------
    table : pandas.DataFrame
        Must have string columns ``COUNTRY`` and ``PROGRAMMING_LANGUAGE``
        with no missing values, already lower-cased. Each
        ``PROGRAMMING_LANGUAGE`` cell is a ``;``-separated list.
    language : str
        Language to look for; compared case-insensitively.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``COUNTRY`` with a single integer column
        ``KNOWS_<LANGUAGE>``. Every country present in *table* gets a row,
        including those whose count is 0. Rows are ordered by count
        (descending), ties alphabetically.
    """
    wanted = language.lower()
    # Compare whole ';'-separated tokens instead of substring matching, so
    # that looking for 'java' does not count 'javascript' and names such as
    # 'c++' are never interpreted as regular expressions.
    uses_language = table['PROGRAMMING_LANGUAGE'].str.split(';').map(
        lambda langs: wanted in (lang.strip() for lang in langs)
    )
    # Summing booleans per country yields the count and keeps countries
    # with no match as 0, so no separate "add the missing countries" pass
    # is needed.
    counts = uses_language.groupby(table['COUNTRY']).sum().astype(int)
    # groupby sorts countries alphabetically; a stable sort on the counts
    # keeps that order among ties, making the output deterministic.
    counts = counts.sort_values(ascending=False, kind='mergesort')
    return counts.rename('KNOWS_' + wanted.upper()).to_frame()


df = pd.DataFrame({
    "PROGRAMMER":np.arange(0,25),
    "AGE":np.array([22,30,np.nan,25,19,27,28,26,33,18,14,np.nan,29,35,19,30,29,24,21,52,np.nan,24,np.nan,18,25],dtype=np.float16),
    "COUNTRY":['uSa','Uk','india','usa','Canada','AngOla','India','braZil','canada','india','brazil','russia','china','usa','india',np.nan,'Australia','india','China','russia','brazil','uk','canada','portugal','ChiNa'],
    "PROGRAMMING_LANGUAGE":['JAVASCRIPT','python;swift;kotlin','python;ruby','c++;c;assembly;python','java;php;golang;ruby','python;c#','c;java','javascript;php','php;sql','c#;java','java;javascript','java;kotlin','javascript','python;c;c++','ruby',np.nan,'javascrit','php;java','swift;kotlin','php;sql','firebase;kotlin','sql;firebase','python;C','python;php',np.nan],
    "GENDER":['male','female','male','male','female','female',np.nan,'male','female','male','male','female','female',np.nan,'female','male','male','male','female','male',np.nan,'male','female','male','male'],
    "LED_ZEPPELIN_FAN":['yes','YES','yes','yes','yes','yes','yes','yes','yes','yes','yes','yes','yes',np.nan,'yes','yes','yes','yes','yes','yes','yes','yes','yes','yes','yes'],
})

# Keep only the two columns of interest and drop rows where either one is
# missing. dropna() is used instead of filling the whole frame with a
# "missing" sentinel, which would also write strings into numeric columns
# such as AGE.
table = df[['COUNTRY', 'PROGRAMMING_LANGUAGE']].dropna()
# Normalise case column by column with the vectorised .str accessor
# (DataFrame.applymap is deprecated in recent pandas releases).
table = table.apply(lambda column: column.str.lower())
table

# One row per country with the number of its programmers that know python.
table_python = count_language_users(table, 'python')
table_python