This Jupyter notebook reads the data from the Google Scholar CSV poll into a pandas DataFrame to compute basic statistics about proteomics software usage as of 2019.
The original poll is not closed yet, and you can still vote here
The results are collected in this GitHub repo
import pandas as pd
#import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.plotly as py
# Enable plotly's offline rendering inside the notebook; connected=True loads
# plotly.js from the CDN instead of embedding it in the notebook file.
init_notebook_mode(connected=True)
# Read the poll data
# Poll answers (the file name carries the date stamp 07032019).
data_df = pd.read_csv("../data/poll-results-07032019.csv")
# Citation table; its 'Citations' and 'Software' columns are plotted later in the notebook.
data_citations = pd.read_csv("../data/citations-count.csv")
# Preview the first rows of the poll table.
data_df.head()
def get_pie_chart(collection):
    """
    This function returns a pie chart trace with the frequency of every
    answer found in a collection of elements.

    Each element may hold several answers separated by ";". Answers are
    whitespace-normalised before counting, and missing values (which
    stringify to 'nan') as well as empty answers are skipped.
    @param collection iterable of cell values, e.g. a column from a pandas dataframe
    @return a plotly go.Pie trace with one slice per distinct answer
    """
    collection_counts = {}
    for word in collection:
        for token in str(word).split(";"):
            # Normalise first, so that "A" and " A" (e.g. from an "A; B"
            # style answer) are counted in the same slice.
            token = token.strip()
            # Skip missing values and the empty tokens left by a stray ";".
            if not token or token == 'nan':
                continue
            collection_counts[token] = collection_counts.get(token, 0) + 1
    trace = go.Pie(labels=list(collection_counts.keys()),
                   values=list(collection_counts.values()))
    return trace
def get_multioption_bar_data(data_df, question):
    """
    This function gets a pandas dataframe and a multioption question and
    generates the traces of a grouped barplot.

    Every column whose name contains ``question`` is treated as one option of
    the question; the option label is the text between "[" and "]" in the
    column name (or the whole column name when there is no such part). Each
    cell may hold several answers separated by ";". One bar trace is built
    per distinct answer: x holds the option labels, y the number of times the
    answer was given for that option.
    @param data_df Pandas data frame
    @param question Question (column name prefix)
    @return list of plotly go.Bar traces, one per distinct answer, in order
            of first appearance
    """
    column_names = [col for col in data_df.columns if question in col]

    # Option label of every matching column, computed once instead of once
    # per row.
    labels = []
    for column in column_names:
        start, end = column.find("["), column.find("]")
        # Without a well-formed "[label]" part, slicing with the -1 returned
        # by find() would silently truncate the name; use the full name.
        labels.append(column[start + 1:end] if 0 <= start < end else column)
    known_labels = set(labels)

    plot_values = {}
    # Row-major traversal keeps the answers in order of first appearance.
    # itertuples() reads the cells positionally, so repeated index labels
    # cannot turn a single-cell lookup into a whole Series.
    for row in data_df[column_names].itertuples(index=False, name=None):
        for column_value, complete_value in zip(labels, row):
            for value in str(complete_value).split(";"):
                # Normalise so that "A" and " A" are the same answer.
                value = value.strip()
                # Skip missing values and empty tokens left by a stray ";".
                if not value or value.lower() == 'nan':
                    continue
                # Answers that are literally one of the option labels are
                # ignored (behaviour kept from the original implementation).
                if value in known_labels:
                    continue
                option_counts = plot_values.setdefault(value, {})
                option_counts[column_value] = option_counts.get(column_value, 0) + 1

    data = [
        go.Bar(x=list(option_counts.keys()),
               y=list(option_counts.values()),
               name=option)
        for option, option_counts in plot_values.items()
    ]
    return data
def get_column_distribution(current_data, question):
    """
    This function provides the distribution for one column question.

    Every column whose name contains ``question`` is scanned. Each cell may
    hold several answers separated by ";"; answers are stripped of
    surrounding whitespace and counted over all rows and matching columns.
    Missing cells (which stringify to 'nan') and empty answers are skipped.
    @param current_data Pandas data frame
    @param question Question (substring of the column name)
    @return a one-element list with a horizontal plotly go.Bar trace
            (counts on x, answers on y)
    """
    downstream_cols = [col for col in current_data.columns if question in col]
    downstream_dic = {}
    # itertuples() reads the cells positionally, so repeated index labels
    # cannot turn a single-cell lookup into a whole Series.
    for row in current_data[downstream_cols].itertuples(index=False, name=None):
        for sentence in row:
            # str() keeps non-string cells (e.g. numbers) from failing on
            # .split() and also turns a missing value into 'nan'.
            sentence = str(sentence)
            if sentence.lower() == 'nan':
                continue
            for value in sentence.split(";"):
                value = value.strip()
                # Empty tokens come from a leading/trailing/double ";".
                if not value:
                    continue
                downstream_dic[value] = downstream_dic.get(value, 0) + 1
    data = [go.Bar(
        x=list(downstream_dic.values()),
        y=list(downstream_dic.keys()),
        orientation='h'
    )]
    return data
Read the position and occupation of every participant of the poll.
# The column at position 2 of the poll table is used as the participant's position/occupation.
positions = data_df.iloc[:,2]
# Count the answers of that column and wrap them in a pie trace.
trace = get_pie_chart(positions)
iplot([trace])
The number of participants in the poll by type of Institution.
# The column at position 3 of the poll table is used as the participant's type of institution.
affiliations = data_df.iloc[:,3]
# Count the answers of that column and wrap them in a pie trace.
trace = get_pie_chart(affiliations)
iplot([trace])
These plots will show the usage of Quantification software.
# Text used to select the columns of this question (matched as a substring of the column name).
question = "Which of these software do you use for your Quant Analysis"
data = get_multioption_bar_data(data_df, question)
# barmode='group' draws the bars of the different traces side by side.
layout = go.Layout(barmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar')
These plots will show the usage of Quantification software (Commercial software).
# Text used to select the columns of this question (matched as a substring of the column name).
question = "Which commercial software do you use for your Quant Analysis"
data = get_multioption_bar_data(data_df, question)
# barmode='group' draws the bars of the different traces side by side.
layout = go.Layout(barmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar')
The purpose of this question is to find out which kinds of features the users of computational proteomics software consider most relevant.
# Text used to select the columns of this question (matched as a substring of the column name).
question = "Which are the features that make you to chose the following software for your data analysis"
data = get_multioption_bar_data(data_df, question)
# Grouped bars, with an explicitly positioned legend (x=1, y=1.0) in a small
# black sans-serif font.
layout = go.Layout(barmode='group',legend=dict(x=1, y=1.0, font=dict(
    family='sans-serif',
    size=10,
    color='#000'
),))
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar')
This section shows the software/tools and packages (Python and R) for downstream analysis.
# We use a filter here because of special characters:
# get_column_distribution matches columns by substring, so only this
# fragment of the column name has to be spelled out.
question_pattern = "Which packages do you use for downstream analysis"
data = get_column_distribution(data_df, question_pattern)
iplot(data, filename='horizontal-bar')
This part of the poll is about the usage of Peptide/protein search engines.
# Text used to select the columns of this question (matched as a substring of the column name).
question="Which Search engine free do you use"
# Horizontal bar trace with the number of times each answer was given.
data = get_column_distribution(data_df, question)
iplot(data, filename='horizontal-bar')
This question aims to detect what type of analysis is performed with each search engine.
# Text used to select the columns of this question (matched as a substring of the column name).
question = "Search engine and type of analysis"
data = get_multioption_bar_data(data_df, question)
# barmode='group' draws the bars of the different traces side by side.
layout = go.Layout(barmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar')
This plot shows the number of citations (Google Scholar) per tool. For these statistics we took into account all the main publications related to each tool.
Note: Please review the file (software-ids.csv) to check that all the publications are included and that nothing is missing. You can also open an issue in the repository.
# Horizontal bars: citation counts on the x axis, software names on the y axis,
# both taken from the citation table loaded at the top of the notebook.
data = [go.Bar(
    x=list(data_citations['Citations']),
    y=list(data_citations['Software']),
    orientation = 'h'
)]
iplot(data, filename='horizontal-bar')
Note: In this case the number of citations has been obtained from the Web of Science. This is the main reason why the final
numbers in the two sections do not match.
## Note: this code has not been implemented natively in the Jupyter notebook yet. At the moment this figure
## is generated with an external script in the same repository.
from IPython.display import Image
# Display the pre-generated image file (path relative to the working directory).
Image("citations_rank_delta.png")
from IPython.display import Image
# Display the image file citations_cumulative.png (path relative to the working directory).
Image("citations_cumulative.png")