First, let's have a look at what our data looks like.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("D:\\Dropbox\\wave\\projects\\scival QS research\\qs\\m_data.csv")
data.head()  # preview the first five rows
| | Unnamed: 0.1 | Unnamed: 0 | Entity | Tags | Views Count | Citation Count | Field-Weighted Citation Impact | Scholarly Output | Academic-Corporate Collaboration (%) | Citations per Publication | International Collaboration (%) | Unnamed: 9 | subject | field |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | Center for General Education | NaN | - | - | - | 0.0 | - | - | - | NaN | Social Science & Management | Accounting & Finance |
| 1 | 1 | 1 | Department of Accountancy | NaN | 1006 | 68 | 1.10 | 25.0 | 0.0 | 2.7 | 36.0 | NaN | Social Science & Management | Accounting & Finance |
| 2 | 2 | 2 | Department of Business Administration | NaN | 1302 | 199 | 0.62 | 42.0 | 0.0 | 4.7 | 35.7 | NaN | Social Science & Management | Accounting & Finance |
| 3 | 3 | 3 | Department of Chinese Literature | NaN | - | - | - | 0.0 | - | - | - | NaN | Social Science & Management | Accounting & Finance |
| 4 | 4 | 4 | Department of Communication Engineering | NaN | - | - | - | 0.0 | - | - | - | NaN | Social Science & Management | Accounting & Finance |
This is a dataset from SciVal containing QS-related indices of papers published by NTPU professors from 2017 to 2022; the variables are the metric columns shown above.
Our goal is to build a single index out of these metrics that measures the overall performance of a specific entity or subject. To do that, we need to know which indices matter most, so we use PCA, a dimension-reduction method, to find out.
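Before working on the real data, here is a minimal self-contained sketch of the idea on synthetic data (all names and numbers below are illustrative, not from the dataset): two strongly correlated variables collapse into a single dominant principal component.

```python
import numpy as np

rng = np.random.default_rng(0)
# synthetic data: two strongly correlated columns plus one independent column
x = rng.normal(size=200)
X = np.column_stack([x, x + 0.1 * rng.normal(size=200), rng.normal(size=200)])

# eigendecompose the correlation matrix
# (eigh is the symmetric-matrix variant; it returns eigenvalues in ascending order)
eig_val, eig_vec = np.linalg.eigh(np.corrcoef(X.T))
eig_val = eig_val[::-1]          # largest (most variance) first

# the two correlated columns collapse into one dominant component
print(eig_val / eig_val.sum())
```

The first ratio dominates: one component captures almost all the variance shared by the two correlated columns.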
data.loc[data.iloc[:, 4] == "-"] = ""  # blank out rows whose Views Count is missing ("-")
num = data.iloc[:, 4:11].apply(pd.to_numeric, errors = "coerce").fillna(0)  # coerce "-" and "" to NaN, then 0
num_remove_zero = num.loc[~(num == 0).all(axis = 1)]  # drop rows with no data at all
mean = np.mean(num_remove_zero, axis = 0)
std = np.std(num_remove_zero, axis = 0)
num_mean = num_remove_zero - mean         # mean-centered dataset
num_n = (num_remove_zero - mean) / std    # standardized dataset
Before we start, let's look at how these variables correlate with each other by computing their correlation matrix. Since num_n is already standardized, its covariance matrix is (up to a factor of n/(n-1)) exactly the correlation matrix.
var = ["Views Count", "Citation Count", "Field-Weighted Citation Impact",
"Scholarly Output", "Academic-Corporate Collaboration (%)", "Citations per Publication",
"International Collaboration (%)"]
cor_m = pd.DataFrame(np.round(np.cov(num_n.T), 2), columns = var, index = var)
cor_m
| | Views Count | Citation Count | Field-Weighted Citation Impact | Scholarly Output | Academic-Corporate Collaboration (%) | Citations per Publication | International Collaboration (%) |
|---|---|---|---|---|---|---|---|
| Views Count | 1.00 | 0.94 | 0.01 | 0.94 | -0.00 | 0.05 | -0.01 |
| Citation Count | 0.94 | 1.00 | 0.06 | 0.95 | 0.02 | 0.13 | -0.00 |
| Field-Weighted Citation Impact | 0.01 | 0.06 | 1.00 | -0.01 | 0.06 | 0.55 | 0.36 |
| Scholarly Output | 0.94 | 0.95 | -0.01 | 1.00 | 0.00 | 0.02 | -0.03 |
| Academic-Corporate Collaboration (%) | -0.00 | 0.02 | 0.06 | 0.00 | 1.00 | 0.19 | 0.02 |
| Citations per Publication | 0.05 | 0.13 | 0.55 | 0.02 | 0.19 | 1.00 | 0.31 |
| International Collaboration (%) | -0.01 | -0.00 | 0.36 | -0.03 | 0.02 | 0.31 | 1.00 |
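As a side note, the reason np.cov can stand in for a correlation matrix here is that the covariance of standardized variables equals the correlation of the raw variables; the default np.cov differs only by the factor n/(n-1), which vanishes after rounding. A quick check on synthetic data (all names here are illustrative):

```python
import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 4)) @ rng.normal(size=(4, 4))  # four correlated columns
Z = (X - X.mean(axis=0)) / X.std(axis=0)                 # standardize (population std, as in the text)

# covariance of the standardized data (population-normalized) equals
# the correlation matrix of the raw data
print(np.allclose(np.cov(Z.T, bias=True), np.corrcoef(X.T)))
```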
eig_val, eig_vec = np.linalg.eig(cor_m) # diagonalize the correlation matrix to extract eigenvalues and eigenvectors
eig_vec = eig_vec.T # one eigenvector per row
print("The eigenvalues are:")
print(np.array(np.round(eig_val, 3)))
print("\nThe eigenvectors are:")
for i in eig_vec:
    print(np.round(np.array(i), 4))
The eigenvalues are:
[2.897 1.852 1.005 0.707 0.432 0.063 0.043]
The eigenvectors are:
[ 0.5731  0.5781  0.0424  0.574   0.0151  0.076   0.0085]
[ 0.054   0.006  -0.6036  0.0762 -0.1875 -0.6003 -0.481 ]
[ 0.0155 -0.0018  0.1443  0.009  -0.9333 -0.1012  0.3122]
[-0.0353  0.0168  0.3518 -0.0381 -0.2601  0.3736 -0.8161]
[-0.0208  0.0259 -0.6994 -0.0419 -0.1607  0.6909  0.0693]
[-0.8135  0.3421 -0.004   0.4698 -0.0042 -0.0152  0.013 ]
[ 0.0705 -0.7401  0.0095  0.6639 -0.0009  0.0803 -0.008 ]
Here, we diagonalize the correlation matrix to obtain its eigenvalues and eigenvectors. The eigenvector with the largest eigenvalue serves as the first principal component, the next as the second principal component, and so on. The eigenvalues also let us construct a scree plot and a plot of cumulative explained variance to decide how many principal components to retain.
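The defining property we rely on, C v = λ v, and the fact that the eigenvalues of a correlation matrix sum to the number of variables (its trace), can be checked numerically on a small synthetic matrix (names ours):

```python
import numpy as np

# a small synthetic 3x3 correlation matrix
C = np.corrcoef(np.random.default_rng(2).normal(size=(50, 3)).cumsum(axis=1).T)
eig_val, eig_vec = np.linalg.eig(C)

# each column v of eig_vec satisfies the defining property C v = lambda v
assert np.allclose(C @ eig_vec, eig_vec * eig_val)

# the eigenvalues of a correlation matrix sum to its trace, i.e. the number of variables
print(round(float(eig_val.sum()), 6))
```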
tag = [f"PC{i}" for i in range(1, 8, 1)]
x = np.arange(1, 8, 1)
fig, ax = plt.subplots()
ax.plot(x, eig_val, marker = "o", color = "blue")
ax.set_xticks(x)
ax.set_xticklabels(tag)
ax.set_ylabel("Eigenvalue", color = "blue", fontsize = 16)
explained_var = np.cumsum(eig_val) / np.sum(eig_val)  # cumulative explained variance
ax2 = ax.twinx()
ax2.plot(x, explained_var, marker = "o", color = "red")
ax2.set_ylabel("Cumulative Explained Variance", color = "red", fontsize = 16)
plt.show()
From this chart, we can see that the eigenvalues drop below 1 starting from the 4th principal component, and that the first three components already explain over 80% of the variance. Based on the Kaiser criterion and the 80% threshold, we retain three principal components.
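Both selection rules can be verified directly from the eigenvalues printed above:

```python
import numpy as np

eig_val = np.array([2.897, 1.852, 1.005, 0.707, 0.432, 0.063, 0.043])  # from the output above

kaiser_k = int((eig_val > 1).sum())           # Kaiser criterion: keep components with eigenvalue > 1
cum_var = np.cumsum(eig_val) / eig_val.sum()  # cumulative explained variance

print(kaiser_k)                               # → 3
print(round(cum_var[2], 3))                   # → 0.822
```

Both rules agree: three components, explaining about 82% of the variance.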
Now, we form a correlation matrix between the principal components and the original variables. By doing this, we can see what each principal component is mainly composed of, and from that infer the relative importance of the original variables.
num_std_arr = np.array(num_n)
# project the original points onto the new axes (one column of PC scores per component)
project_point = num_std_arr @ eig_vec.T
# compute correlations between the PC scores and the original variables
corr_pc_ori = []
for i in range(7):
    temp = []
    for j in range(7):
        temp.append(np.corrcoef(np.array(project_point).T[i], np.array(num_n).T[j])[0][1])
    corr_pc_ori.append(temp)
corr_pc_ori = pd.DataFrame(np.round(corr_pc_ori, 2), columns = var, index = tag)
corr_pc_ori
| | Views Count | Citation Count | Field-Weighted Citation Impact | Scholarly Output | Academic-Corporate Collaboration (%) | Citations per Publication | International Collaboration (%) |
|---|---|---|---|---|---|---|---|
| PC1 | 0.97 | 0.98 | 0.07 | 0.98 | 0.03 | 0.13 | 0.02 |
| PC2 | 0.07 | 0.01 | -0.82 | 0.10 | -0.25 | -0.82 | -0.65 |
| PC3 | 0.02 | 0.00 | 0.14 | 0.01 | -0.94 | -0.10 | 0.31 |
| PC4 | -0.03 | 0.02 | 0.30 | -0.03 | -0.22 | 0.31 | -0.69 |
| PC5 | -0.01 | 0.02 | -0.46 | -0.03 | -0.11 | 0.45 | 0.05 |
| PC6 | -0.21 | 0.10 | -0.00 | 0.11 | 0.01 | -0.00 | -0.01 |
| PC7 | 0.04 | -0.16 | 0.02 | 0.14 | 0.02 | -0.00 | 0.00 |
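As an aside, these loadings need not be computed by brute-force projection and correlation: for standardized data, the correlation between PC j and variable i equals the i-th entry of eigenvector j scaled by the square root of eigenvalue j. A sketch on synthetic data (names ours):

```python
import numpy as np

rng = np.random.default_rng(3)
Z = rng.normal(size=(200, 3)) @ rng.normal(size=(3, 3))  # correlated synthetic data
Z = (Z - Z.mean(axis=0)) / Z.std(axis=0)                 # standardize

C = np.cov(Z.T, bias=True)                               # = correlation matrix
lam, V = np.linalg.eig(C)
scores = Z @ V                                           # PC scores, one column per component

# loading of variable i on PC j equals V[i, j] * sqrt(lam[j])
loadings = V * np.sqrt(lam)
brute = np.array([[np.corrcoef(scores[:, j], Z[:, i])[0, 1]
                   for i in range(3)] for j in range(3)])
print(np.allclose(brute, loadings.T))
```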
From this correlation matrix, we can clearly see that the first principal component is mainly formed by Views Count, Citation Count, and Scholarly Output; the second mainly by FWCI and Citations per Publication; and the third by Academic-Corporate Collaboration.
We also know that more publications bring more views and more citations: Scholarly Output is almost perfectly correlated with Views Count (0.94) and Citation Count (0.95). We therefore select Scholarly Output as the most important index. Next, since Citations per Publication moves closely with FWCI, we take Citations per Publication as the second most important index. Lastly, Academic-Corporate Collaboration is our third most important index.
Using this result, together with the explained variance contributed by each retained principal component (each weight is roughly that component's share of the variance explained by the three retained components), we define our new index, Publication Performance (PP), as
$$PP = 0.506 \; stdz(SO) + 0.33 \; stdz(CpP) + 0.173 \; stdz(ACC)$$
where $stdz(x) = \frac{x - mean(x)}{std(x)}$
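As a minimal sketch (the function name and the toy numbers are ours; stdz here uses pandas' default sample standard deviation), PP can be computed directly from the three raw series:

```python
import pandas as pd

def publication_performance(so, cpp, acc):
    """PP = 0.506*stdz(SO) + 0.33*stdz(CpP) + 0.173*stdz(ACC)."""
    stdz = lambda s: (s - s.mean()) / s.std()  # standardize a series
    return 0.506 * stdz(so) + 0.33 * stdz(cpp) + 0.173 * stdz(acc)

# toy example with made-up numbers for four departments
so  = pd.Series([25.0, 42.0, 10.0, 5.0])   # Scholarly Output
cpp = pd.Series([2.7, 4.7, 1.0, 0.5])      # Citations per Publication
acc = pd.Series([0.0, 0.0, 5.0, 0.0])      # Academic-Corporate Collaboration (%)
print(publication_performance(so, cpp, acc).round(2))
```

Because each standardized term has zero mean, PP is centered around 0: positive values mean above-average performance.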
We can now look at the top 5 departments by publication performance.
data_clean = data.loc[~(data == "").all(axis = 1)]  # drop the blanked-out rows
def stdlize(df):
    columns = ["Views Count", "Citation Count", "Field-Weighted Citation Impact",
               "Scholarly Output", "Academic-Corporate Collaboration (%)", "Citations per Publication",
               "International Collaboration (%)"]
    for i in columns:
        col = pd.to_numeric(df[i], errors = "coerce")  # convert once, coercing non-numeric to NaN
        df[i] = (col - col.mean()) / col.std()         # standardize in place
stdlize(data_clean)
def calculate_perform(df):
    columns = ["Scholarly Output", "Citations per Publication",
               "Academic-Corporate Collaboration (%)"]
    dep = set(df["Entity"])
    value_by_dep = []
    for i in dep:
        df_dep = df[df["Entity"] == i]
        means = [pd.to_numeric(df_dep[j], errors = "coerce").mean() for j in columns]
        index = 0.506 * means[0] + 0.33 * means[1] + 0.173 * means[2]
        value_by_dep.append([round(index, 2), i])
    return value_by_dep
value_by_dep = sorted(calculate_perform(data_clean), reverse = True)[1:]  # skip the first, invalid entry
dep = [i[1] for i in value_by_dep]
value = [i[0] for i in value_by_dep]
dep_abbr = ["BA", "EE", "CSIE", "SoC", "CE"]  # abbreviations for the top five departments
fig, ax = plt.subplots()
ax.bar(dep_abbr, value[:5])
ax.set_title("Top 5 Publication Performance")
plt.show()
As a result, the Department of Business Administration has the best publication performance in NTPU, followed by the Department of Electrical Engineering, the Department of Computer Science and Information Engineering, the Graduate School of Criminology, and the Department of Communication Engineering. This is a surprising result: NTPU is famous for its College of Business, yet we find that the engineering departments actually perform better than most departments in the College of Business.
Next, we will check NTPU's publication performance across subjects.
def calculate_perform_sub(df):
    columns = ["Scholarly Output", "Citations per Publication",
               "Academic-Corporate Collaboration (%)"]
    sub = set(df["subject"])
    value_by_sub = []
    for i in sub:
        df_sub = df[df["subject"] == i]
        means = [pd.to_numeric(df_sub[j], errors = "coerce").mean() for j in columns]
        index = 0.506 * means[0] + 0.33 * means[1] + 0.173 * means[2]
        value_by_sub.append([round(index, 2), i])
    return value_by_sub
value_by_sub = sorted(calculate_perform_sub(data_clean), reverse = True)
subb = [i[1] for i in value_by_sub]
value_s = [i[0] for i in value_by_sub]
df_result_sub = pd.DataFrame({
"Subject" : subb,
"Publication Performance" : value_s
})
print(df_result_sub)
                       Subject  Publication Performance
0     Engineering & Technology                     0.15
1             Natural Sciences                     0.08
2     Life Sciences & Medicine                     0.03
3             Art & Humanities                    -0.11
4  Social Science & Management                    -0.15
Strikingly, NTPU performs best in Engineering & Technology, well ahead of Social Science & Management, the subject it is supposedly strongest in. This suggests that the departments in the College of Business may need to put more emphasis on research.
With the help of PCA, we were able to distill a few meaningful, important indices out of many, giving us a much clearer picture of publication performance at NTPU.