Data Science Python Cheat Sheet
NumPy dtype character codes:
i - integer
b - boolean
u - unsigned integer
f - float
c - complex float
m - timedelta
M - datetime
O - object
S - string
U - unicode string
V - fixed chunk of memory for other type (void)
np.array_split(arr, n, axis=1) : splits an array into n sub-arrays along the given axis (see also hsplit, vsplit and dsplit)
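A minimal runnable sketch of the NumPy items above (the sample array is made up for illustration):

import numpy as np

arr = np.array([1, 2, 3, 4, 5, 6])
print(arr.dtype)               # e.g. int64 (kind code 'i')
print(np.array_split(arr, 4))  # 4 sub-arrays; unlike np.split, uneven sizes are allowed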
import matplotlib.pyplot as plt
plt.plot(x, y, label="linename", marker='', color='', linestyle='', lw=n) : plots a line from arrays x and y
plt.plot(x1, y1, x2, y2) : plots two lines in one graph
plt.xlabel("labname") and plt.ylabel("labname") : set the axis labels
plt.title("title") : sets the graph title
plt.xlim(n, m) and plt.ylim(n, m) : set the lower and upper limits of an axis
plt.axis([x0, xn, y0, yn]) : sets all four axis limits at once
plt.show() : displays the graph
plt.grid() : shows grid lines in the graph (use grid(axis='x') or grid(axis='y') for one axis only)
plt.subplot(x, y, z) : places many graphs in one figure (x: no. of rows, y: no. of columns, z: subplot index)
plt.legend(title="title", loc="") : shows the legend in the graph
plt.figure(figsize=(n, m)) : sets the figure size
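A minimal sketch tying the pyplot calls above together (the data and label names are made up):

import matplotlib.pyplot as plt

x = [1, 2, 3, 4]
y = [1, 4, 9, 16]
plt.figure(figsize=(6, 4))
plt.plot(x, y, label="squares", marker="o", color="blue", linestyle="--", lw=2)
plt.xlabel("x")
plt.ylabel("y")
plt.title("Demo plot")
plt.grid(axis="y")
plt.legend(title="series", loc="upper left")
plt.show()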
pd.Series([values], index=[values]) : creates a series, like np.array() but also indexed like a dictionary (columns= belongs to pd.DataFrame, not to Series)
arr.index : displays the index
arr.values : displays the values
arr.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True) : counts occurrences of each value
arr.apply(np.function) : applies a NumPy function to each value of the pandas Series
tab['colname'] : accesses the given column (can also create a new column, e.g. tab['newcol'] = tab.col1 + ',' + tab.col2)
tab.colname : accesses the given column (attribute style; only works for valid identifier names)
tab.shape : no. of rows and columns
tab.describe() : summary statistics of the table (use describe(include='all') to include non-numeric columns)
tab.info() : concise summary of the table (column dtypes, non-null counts, memory usage)
tab.dtypes : data type of each column
tab.col.value_counts(normalize=True, dropna=True) : counts rows per unique value of the column
tab.plot(kind='', x="namexplot", y="nameyplot") : plots the table data (backed by matplotlib; kind can be 'line', 'bar', 'scatter', ...)
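A minimal sketch of the pandas items above (the toy data is made up):

import numpy as np
import pandas as pd

s = pd.Series([10, 20, 20], index=["a", "b", "c"])
print(s.index, s.values)
print(s.value_counts(normalize=True))  # proportions instead of raw counts
print(s.apply(np.sqrt))                # applies a NumPy function element-wise

tab = pd.DataFrame({"col1": ["x", "y"], "col2": ["1", "2"]})
tab["newcol"] = tab.col1 + "," + tab.col2
print(tab.shape)
print(tab.dtypes)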
from sklearn.svm import SVC
classifier = SVC(kernel='rbf' | 'linear' | 'poly', C=float, gamma=float) (note the capital C; typical search values: C in [0.01, 0.1, 1, 10], gamma in [0.01, 0.1, 1])
classifier.fit(x_train, y_train) : to fit the SVM
y_pred = classifier.predict(x_test)
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=n) : to create a classifier with n neighbors
classifier.fit(x_train, y_train) : to fit the k-nearest neighbors classifier
y_pred = classifier.predict(x_test)
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='gini' | 'entropy', max_depth=int, min_samples_split=int) (typical search values: max_depth in [2, 3, 4])
classifier.fit(x_train, y_train) : to fit the decision tree classifier
y_pred = classifier.predict(x_test)
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(criterion='gini' | 'entropy', n_estimators=int, max_depth=int, min_samples_split=int, random_state=43) (typical search values: max_depth in [3, 4], min_samples_split in [5, 7])
clf_rf = clf_rf.fit(x_train, y_train) : to fit the random forest (fit returns the fitted estimator)
y_pred = clf_rf.predict(x_test)
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(x_train, y_train) : to get XGB classification
y_pred = classifier.predict(x_test)
from sklearn.svm import SVR
clf = SVR()
clf.fit(X_train, y_train) : to fit the SVR
predicted = clf.predict(X_test)
import xgboost as xgb
clf = xgb.XGBRegressor()
clf.fit(X_train, y_train) : to get XGB regression
predicted = clf.predict(X_test)
from sklearn.tree import DecisionTreeRegressor
clf = DecisionTreeRegressor()
clf.fit(X_train, y_train) : to get decision tree regression
predicted = clf.predict(X_test)
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()
clf.fit(X_train, y_train) : to get random forest regression
predicted = clf.predict(X_test)
from sklearn.metrics import f1_score, accuracy_score
ac = accuracy_score(y_test, clf_rf.predict(x_test)) : to get the accuracy score
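All the classifier and regressor recipes above follow the same fit/predict/score pattern; a minimal end-to-end sketch, using sklearn's bundled iris data as a stand-in dataset:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=43)

clf_rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=43)
clf_rf.fit(x_train, y_train)
y_pred = clf_rf.predict(x_test)
print("accuracy:", accuracy_score(y_test, y_pred))
print("f1 (macro):", f1_score(y_test, y_pred, average="macro"))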
from sklearn.feature_selection import SelectKBest, chi2
# find best scored 5 features
select_feature = SelectKBest(chi2, k=5).fit(x_train, y_train)  # chi2 requires non-negative feature values
print('Score list:', select_feature.scores_) : to print the chi2 score of each feature
from sklearn.feature_selection import RFE
# Create the RFE object and rank each pixel
clf_rf_3 = RandomForestClassifier()
rfe = RFE(estimator=clf_rf_3, n_features_to_select=5, step=1)
rfe = rfe.fit(x_train, y_train) : to get RFE ranks
from sklearn.feature_selection import RFECV
# The "accuracy" scoring is proportional to the number of correct classifications
clf_rf_4 = RandomForestClassifier()
rfecv = RFECV(estimator=clf_rf_4, step=1, cv=5, scoring='accuracy')  # 5-fold cross-validation
rfecv = rfecv.fit(x_train, y_train)
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train.columns[rfecv.support_]) : to get rfecv
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
tab['col'] = lb.fit_transform(tab['col'])
for i in tab.columns:
    tab[i] = lb.fit_transform(tab[i]) : to convert the values of every column to integer codes
from sklearn.preprocessing import OneHotEncoder
ob = OneHotEncoder()
encoded = ob.fit_transform(tab[['col']]) : to one-hot encode a column (expects 2-D input and returns a sparse matrix with one column per category, so it cannot be assigned back to a single column; pd.get_dummies(tab) is the DataFrame-friendly alternative)
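A minimal sketch contrasting the two encoders (the toy column is made up):

import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

tab = pd.DataFrame({"col": ["red", "green", "red"]})
print(LabelEncoder().fit_transform(tab["col"]))               # [1 0 1]: one integer code per category (sorted alphabetically)
print(OneHotEncoder().fit_transform(tab[["col"]]).toarray())  # one 0/1 column per category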
from sklearn.manifold import TSNE
tn = TSNE(n_components=2, random_state=0)
xn = tn.fit_transform(tab)
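xn is an (n_samples, 2) array; a self-contained sketch that runs t-SNE and scatter-plots the embedding (iris as a stand-in dataset):

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.manifold import TSNE

X, y = load_iris(return_X_y=True)
xn = TSNE(n_components=2, random_state=0).fit_transform(X)  # 2-D embedding of the rows
plt.scatter(xn[:, 0], xn[:, 1], c=y, s=10)
plt.title("t-SNE projection")
plt.show()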
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(model, parameters, cv=n, scoring='f1_macro')
gs.fit(x_train, y_train)
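The candidate value lists shown with the classifiers above are exactly what GridSearchCV expects as a parameter grid; a minimal sketch with SVC (iris as a stand-in dataset):

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
parameters = {"kernel": ["rbf", "linear"], "C": [0.01, 0.1, 1, 10], "gamma": [0.01, 0.1, 1]}
gs = GridSearchCV(SVC(), parameters, cv=5, scoring="f1_macro")
gs.fit(X, y)
print(gs.best_params_, gs.best_score_)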
from sklearn.model_selection import LeaveOneOut
lt = LeaveOneOut()
for train, test in lt.split(n):
    print("Train:", n[train], "Test:", n[test]) : each pass leaves one sample of array n out as the test set