Getting started

Introductory notebook for getting started with ExaMon and the Monte Cimone RISC-V cluster.

Prerequisites¶

An account on Monte Cimone that you can connect to via ssh
Web browser

To access the Grafana instance via the browser¶

On your laptop/workstation, create a tunnel with your MC user using the following command:

ssh -L 3000:localhost:3000 -L 5000:localhost:5000 -p 2223 <your_mc_username>@137.204.56.52

Open your web browser and go to the following page:
- http://localhost:3000/
Enter the following credentials to access the dashboard:
```
User: ext_student
Password: ext_student
```
Once logged in, you will land on the Home page. From there, you can open the example dashboard by visiting the following link:
- http://localhost:3000/d/PaU3WSt7z/montecimone-overview?orgId=1

To access the same data via script/notebook¶

Prerequisites:
- In addition to the prerequisites above, the ability to run a Jupyter server (Python 3) on your laptop/workstation
On your laptop, start:
- a tunnel as in the previous step
- a python 3 jupyter server.
The examon-client package is required to access the database.
- Install it directly from the notebook by running the following in a cell (only needed once):
```
! pip install https://github.com/fbeneventi/releases/releases/latest/download/examon-client.zip
```

In [1]:

Copied!





%matplotlib inline

# ssh -L 3000:192.168.1.201:3000 -L 5000:192.168.1.201:5000 -p 2223 <mc_username>@137.204.56.52


import os
import numpy as np

import pandas as pd
from examon.examon import Client, ExamonQL

# Connect
USER = 'ext_student'
PWD = 'ext_student'
ex = Client('127.0.0.1', port='3000', user=USER, password=PWD, verbose=False, proxy=True)
sq = ExamonQL(ex)
%matplotlib inline

# ssh -L 3000:192.168.1.201:3000 -L 5000:192.168.1.201:5000 -p 2223 @137.204.56.52


import os
import numpy as np

import pandas as pd
from examon.examon import Client, ExamonQL

# Connect
USER = 'ext_student'
PWD = 'ext_student'
ex = Client('127.0.0.1', port='3000', user=USER, password=PWD, verbose=False, proxy=True)
sq = ExamonQL(ex)

In [3]:

Copied!

# Show the metric names exposed by the query object as a one-column table.
pd.DataFrame(sq.metric_list)

Out[3]:

	name
0	CYCLES
1	INSTRUCTIONS
2	dsk_total.read
3	dsk_total.writ
4	io_total.read
5	io_total.writ
6	load_avg.15m
7	load_avg.1m
8	load_avg.5m
9	memory_usage.buff
10	memory_usage.cach
11	memory_usage.free
12	memory_usage.used
13	net_total.recv
14	net_total.send
15	paging.in
16	paging.out
17	procs.blk
18	procs.new
19	procs.run
20	system.csw
21	system.int
22	temperature.average
23	temperature.cpu_temp
24	temperature.mb_temp
25	temperature.nvme_temp
26	temperature.total
27	total_cpu_usage.idl
28	total_cpu_usage.stl
29	total_cpu_usage.sys
30	total_cpu_usage.usr
31	total_cpu_usage.wai

In [5]:

Copied!

# Describe the INSTRUCTIONS metric: one row per tag key with the values it can take.
instructions_tags = (
    sq.DESCRIBE(metric='INSTRUCTIONS')
    .execute()
)

instructions_tags

Out[5]:

	name	tag key	tag values
0	INSTRUCTIONS	node	[mcimone-node-1, mcimone-node-2, mcimone-node-...
1	INSTRUCTIONS	core	[0, 1, 2, 3]
2	INSTRUCTIONS	plugin	[pmu_pub]
3	INSTRUCTIONS	chnl	[data]
4	INSTRUCTIONS	cluster	[hifive]
5	INSTRUCTIONS	org	[unibo]

In [22]:

Copied!





# Query INSTRUCTIONS samples, tagged by node, cluster and core, using a
# relative start of 30 minutes (presumably counted back from now — confirm
# against the ExaMon client documentation).
data = (
    sq.SELECT('node', 'cluster', 'core')
    .FROM('INSTRUCTIONS')
    .TSTART(30, 'minutes')
    .execute()
)

# Preview the long-format result table.
data.df_table.head(10)

Out[22]:

	cluster	name	node	timestamp	value
0	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-27 19:20:52+02:00	1.794506e+11
1	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-27 19:20:52.500000+02:00	1.794748e+11
2	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-27 19:20:53+02:00	1.794753e+11
3	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-27 19:20:53.500000+02:00	1.794758e+11
4	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-27 19:20:54+02:00	1.794764e+11
5	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-27 19:20:54.500000+02:00	1.794769e+11
6	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-27 19:20:55+02:00	1.794775e+11
7	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-27 19:20:55.500000+02:00	1.794780e+11
8	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-27 19:20:56+02:00	1.794786e+11
9	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-27 19:20:56.500000+02:00	1.794791e+11

In [24]:

Copied!

# Convert the query result to time series and draw one subplot per series.
# The trailing ';' suppresses the text repr of the returned axes.
series = data.to_series(flat_index=True, interp='time', dropna=True)
series.df_ts.plot(figsize=[15,30], subplots=True);

No description has been provided for this image

In [36]:

Copied!





import json

# Setup: register the job table with the client. Guarded so that re-running
# this cell does not append the same table name again.
JOB_TABLE = 'job_info_hifive'
if JOB_TABLE not in sq.jc.JOB_TABLES:
    sq.jc.JOB_TABLES.append(JOB_TABLE)

JOB_FIELDS = (
    'name', 'user_id', 'job_id', 'job_state', 'start_time',
    'end_time', 'nodes', 'num_nodes', 'num_cpus', 'work_dir',
)

# Jobs recorded for mcimone-node-1 inside the given time window.
jobs_json = (
    sq.SELECT(*JOB_FIELDS)
    .FROM(JOB_TABLE)
    .WHERE(node='mcimone-node-1')
    .TSTART('27-06-2023 08:09:00')
    .TSTOP('28-06-2023 23:09:00')
    .execute()
)

# The job query result is JSON text, so it is parsed before building the frame.
# It gets its own name so that `data` keeps holding the metric query result.
jobs = pd.DataFrame(json.loads(jobs_json))
jobs.head(50)

Out[36]:

	end_time	job_id	job_state	name	nodes	num_cpus	num_nodes	start_time	user_id	work_dir
0	2023-06-27T18:18:13.000Z	2825	CANCELLED	bash	mcimone-node-1	1	1	2023-06-27T18:00:08.000Z	2001	/home/abartolini/HPL/src/hpl-2.3
1	2023-06-28T11:11:47.000Z	2868	COMPLETED	test	mcimone-node-1	4	1	2023-06-28T11:11:47.000Z	6010	/home/userdeiphd10
2	2023-06-27T14:48:34.000Z	2821	FAILED	hpl	mcimone-node-1	2	1	2023-06-27T14:48:32.000Z	2001	/home/abartolini
3	2023-06-28T11:15:55.000Z	2872	COMPLETED	test	mcimone-node-1	4	1	2023-06-28T11:15:53.000Z	6008	/home/userdeiphd08
4	2023-06-28T07:05:00.000Z	2860	COMPLETED	bash	mcimone-node-1	1	1	2023-06-28T07:04:54.000Z	6001	/home/userdeiphd01
5	2023-06-27T18:19:49.000Z	2831	FAILED	hpl	mcimone-node-1	2	1	2023-06-27T18:19:46.000Z	2001	/home/abartolini
6	2023-06-28T11:32:38.000Z	2887	COMPLETED	sleep	mcimone-node-1	1	1	2023-06-28T11:32:08.000Z	6005	/home/userdeiphd05
7	2023-06-28T11:27:32.000Z	2877	COMPLETED	bash	mcimone-node-1	1	1	2023-06-28T11:27:26.000Z	6008	/home/userdeiphd08/02_ex
8	2023-06-28T11:19:44.000Z	2874	COMPLETED	test	mcimone-node-1	4	1	2023-06-28T11:19:33.000Z	6010	/home/userdeiphd10
9	2023-06-27T21:11:24.000Z	2845	COMPLETED	stream	mcimone-node-1	4	1	2023-06-27T21:11:05.000Z	2001	/home/abartolini
10	2023-06-28T07:34:06.000Z	2864	FAILED	hpl	mcimone-node-1	2	1	2023-06-28T07:34:03.000Z	2001	/home/abartolini
11	2023-06-28T11:31:14.000Z	2882	COMPLETED	sleep	mcimone-node-1	1	1	2023-06-28T11:31:03.000Z	6011	/home/userdeiphd11
12	2023-06-28T11:13:29.000Z	2869	COMPLETED	test	mcimone-node-1	4	1	2023-06-28T11:13:28.000Z	6008	/home/userdeiphd08
13	2023-06-28T11:09:07.000Z	2867	COMPLETED	test	mcimone-node-1	4	1	2023-06-28T11:09:06.000Z	6008	/home/userdeiphd08
14	2023-06-28T11:30:23.000Z	2878	COMPLETED	07smc.sh	mcimone-node-1	1	1	2023-06-28T11:30:22.000Z	6007	/home/userdeiphd07
15	2023-06-28T11:36:17.000Z	2901	COMPLETED	test.sh	mcimone-node-1	1	1	2023-06-28T11:36:16.000Z	6001	/home/userdeiphd01
16	2023-06-28T11:38:31.000Z	2903	COMPLETED	07j.sh	mcimone-node-1	1	1	2023-06-28T11:38:29.000Z	6007	/home/userdeiphd07
17	2023-06-28T11:33:56.000Z	2893	COMPLETED	echo	mcimone-node-1	1	1	2023-06-28T11:33:56.000Z	6011	/home/userdeiphd11
18	2023-06-27T22:03:38.000Z	2855	COMPLETED	hpl	mcimone-node-1	4	1	2023-06-27T22:03:33.000Z	2001	/home/abartolini
19	2023-06-28T11:33:42.000Z	2892	COMPLETED	echo	mcimone-node-1	1	1	2023-06-28T11:33:41.000Z	6003	/home/userdeiphd03
20	2023-06-28T11:33:41.000Z	2890	COMPLETED	echo	mcimone-node-1	1	1	2023-06-28T11:33:40.000Z	6003	/home/userdeiphd03
21	2023-06-28T11:32:08.000Z	2885	COMPLETED	echo	mcimone-node-1	1	1	2023-06-28T11:32:07.000Z	6005	/home/userdeiphd05
22	2023-06-28T11:34:27.000Z	2895	COMPLETED	sleep	mcimone-node-1	1	1	2023-06-28T11:33:57.000Z	6011	/home/userdeiphd11
23	2023-06-28T07:30:40.000Z	2861	COMPLETED	bash	mcimone-node-1	1	1	2023-06-28T07:05:12.000Z	2001	/home/abartolini
24	2023-06-28T08:44:09.000Z	2866	CANCELLED	hpl	mcimone-node-1	4	1	2023-06-28T07:38:27.000Z	2001	/home/abartolini
25	2023-06-27T21:41:14.000Z	2854	COMPLETED	stream	mcimone-node-1	1	1	2023-06-27T21:40:57.000Z	2001	/home/abartolini
26	2023-06-28T07:36:11.000Z	2865	COMPLETED	hpl	mcimone-node-1	4	1	2023-06-28T07:36:06.000Z	2001	/home/abartolini
27	2023-06-28T11:31:03.000Z	2880	COMPLETED	echo	mcimone-node-1	1	1	2023-06-28T11:31:02.000Z	6011	/home/userdeiphd11
28	2023-06-28T11:30:49.000Z	2879	FAILED	bash	mcimone-node-1	1	1	2023-06-28T11:30:23.000Z	6008	/home/userdeiphd08/02_ex
29	2023-06-28T07:04:37.000Z	2859	COMPLETED	bash	mcimone-node-1	1	1	2023-06-28T07:04:26.000Z	2001	/home/abartolini
30	2023-06-27T14:48:11.000Z	2820	FAILED	run_hpl.sh	mcimone-node-1	1	1	2023-06-27T14:48:11.000Z	2001	/home/abartolini
31	2023-06-27T21:38:29.000Z	2852	COMPLETED	stream	mcimone-node-1	1	1	2023-06-27T21:38:11.000Z	2001	/home/abartolini
32	2023-06-28T11:19:09.000Z	2873	COMPLETED	test	mcimone-node-1	4	1	2023-06-28T11:19:07.000Z	6010	/home/userdeiphd10
33	2023-06-27T21:13:04.000Z	2846	COMPLETED	stream	mcimone-node-1	4	1	2023-06-27T21:12:47.000Z	2001	/home/abartolini
34	2023-06-27T21:18:39.000Z	2851	COMPLETED	stream	mcimone-node-1	1	1	2023-06-27T21:18:22.000Z	2001	/home/abartolini
35	2023-06-28T11:44:48.000Z	2905	FAILED	Test1	mcimone-node-1	1	1	2023-06-28T11:44:47.000Z	6004	/home/userdeiphd04
36	2023-06-28T11:14:08.000Z	2871	COMPLETED	test	mcimone-node-1	4	1	2023-06-28T11:14:07.000Z	6008	/home/userdeiphd08
37	2023-06-27T14:50:37.000Z	2822	FAILED	hpl	mcimone-node-1	2	1	2023-06-27T14:50:34.000Z	2001	/home/abartolini

In [41]:

Copied!





import json

# Look up a single job by its id. Relies on 'job_info_hifive' having been
# registered in sq.jc.JOB_TABLES by the previous cell.
job_json = (
    sq.SELECT('name', 'user_id', 'job_id', 'job_state', 'start_time',
              'end_time', 'nodes', 'num_nodes', 'num_cpus', 'work_dir')
    .FROM('job_info_hifive')
    .WHERE(job_id='2866')
    .TSTART('27-06-2023 08:09:00')
    .execute()
)

# The result is JSON text; parse it into a one-row-per-job frame.
job = pd.DataFrame(json.loads(job_json))
job.head()

Out[41]:

	end_time	job_id	job_state	name	nodes	num_cpus	num_nodes	start_time	user_id	work_dir
0	2023-06-28T08:44:09.000Z	2866	CANCELLED	hpl	mcimone-node-1	4	1	2023-06-28T07:38:27.000Z	2001	/home/abartolini

In [69]:

Copied!





# Time window of job 2866, taken from its start_time / end_time in the job table.
# NOTE(review): the job table prints these times with a 'Z' suffix, while the
# metric timestamps come back as Europe/Rome (+02:00) — confirm that both
# refer to the same clock before relying on the window.
JOB_START = '28-06-2023 07:38:27'
JOB_STOP = '28-06-2023 08:44:09'

# INSTRUCTIONS samples of mcimone-node-1 during that window.
data = (
    sq.SELECT('node', 'cluster', 'core')
    .FROM('INSTRUCTIONS')
    .WHERE(node='mcimone-node-1')
    .TSTART(JOB_START)
    .TSTOP(JOB_STOP)
    .execute()
)

data.df_table.head(10)

Out[69]:

	cluster	name	node	timestamp	value
0	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-28 07:38:27+02:00	3.100586e+11
1	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-28 07:38:27.500000+02:00	3.100629e+11
2	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-28 07:38:28+02:00	3.100923e+11
3	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-28 07:38:28.500000+02:00	3.100927e+11
4	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-28 07:38:29+02:00	3.100957e+11
5	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-28 07:38:29.500000+02:00	3.101099e+11
6	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-28 07:38:30+02:00	3.101479e+11
7	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-28 07:38:30.500000+02:00	3.103973e+11
8	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-28 07:38:31+02:00	3.108438e+11
9	hifive	INSTRUCTIONS	mcimone-node-1	2023-06-28 07:38:31.500000+02:00	3.113999e+11

In [46]:

Copied!

# Column dtypes, non-null counts and memory footprint of the query result.
data.df_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252320 entries, 0 to 252319
Data columns (total 6 columns):
cluster      252320 non-null object
core         252320 non-null object
name         252320 non-null object
node         252320 non-null object
timestamp    252320 non-null datetime64[ns, Europe/Rome]
value        252320 non-null float64
dtypes: datetime64[ns, Europe/Rome](1), float64(1), object(4)
memory usage: 11.6+ MB

In [70]:

Copied!





# Time-ordered copy of the query result; sort_values returns a new frame, so
# data.df_table itself is left untouched.
df = data.df_table.sort_values('timestamp')

# INSTRUCTIONS is differenced per (core, node); both differences are taken
# from the same grouping before any column is added to the frame.
per_counter = df.groupby(['core', 'node'])
elapsed = per_counter['timestamp'].diff()
executed = per_counter['value'].diff()

# Time difference between consecutive rows for each core and node
df['time_diff'] = elapsed

# Instructions per second for each core and node
df['instructions_per_second'] = executed / elapsed.dt.total_seconds()

# Drop only the rows where no rate could be computed (the first sample of each
# core and node), instead of every row that has a NaN in any column.
df = df.dropna(subset=['instructions_per_second'])

# Rich display of the result (pandas truncates long frames on its own).
df[['cluster', 'core', 'node', 'instructions_per_second']]

      cluster core            node  instructions_per_second
23656  hifive    3  mcimone-node-1               38523114.0
1      hifive    0  mcimone-node-1                8620240.0
7886   hifive    1  mcimone-node-1               14623376.0
15771  hifive    2  mcimone-node-1               14995134.0
7887   hifive    1  mcimone-node-1               83411746.0
23657  hifive    3  mcimone-node-1               11415616.0
2      hifive    0  mcimone-node-1               58787754.0
15772  hifive    2  mcimone-node-1               14676376.0
23658  hifive    3  mcimone-node-1                1645138.0
7888   hifive    1  mcimone-node-1                 350340.0
3      hifive    0  mcimone-node-1                 803444.0
15773  hifive    2  mcimone-node-1                1136978.0
23659  hifive    3  mcimone-node-1                2783320.0
7889   hifive    1  mcimone-node-1                5273074.0
15774  hifive    2  mcimone-node-1                2275020.0
4      hifive    0  mcimone-node-1                6114222.0
23660  hifive    3  mcimone-node-1              376575758.0
5      hifive    0  mcimone-node-1               28421320.0
15775  hifive    2  mcimone-node-1               21301446.0
7890   hifive    1  mcimone-node-1              140078054.0
15776  hifive    2  mcimone-node-1               11004830.0
6      hifive    0  mcimone-node-1               76035596.0
23661  hifive    3  mcimone-node-1              188484324.0
7891   hifive    1  mcimone-node-1              271213162.0
7892   hifive    1  mcimone-node-1              475208026.0
15777  hifive    2  mcimone-node-1              483062156.0
23662  hifive    3  mcimone-node-1              486469840.0
7      hifive    0  mcimone-node-1              498794364.0
7893   hifive    1  mcimone-node-1              885180146.0
23663  hifive    3  mcimone-node-1              894075140.0
...       ...  ...             ...                      ...
23647  hifive    2  mcimone-node-1              233339268.0
15762  hifive    1  mcimone-node-1              228270452.0
23648  hifive    2  mcimone-node-1              217027384.0
7878   hifive    0  mcimone-node-1              213077746.0
31533  hifive    3  mcimone-node-1              218002042.0
15763  hifive    1  mcimone-node-1              221205918.0
15764  hifive    1  mcimone-node-1              221965008.0
31534  hifive    3  mcimone-node-1              233791354.0
23649  hifive    2  mcimone-node-1              223936646.0
7879   hifive    0  mcimone-node-1              201658966.0
31535  hifive    3  mcimone-node-1              209675462.0
23650  hifive    2  mcimone-node-1              217535866.0
15765  hifive    1  mcimone-node-1              221215852.0
7880   hifive    0  mcimone-node-1              209611952.0
23651  hifive    2  mcimone-node-1              266259612.0
15766  hifive    1  mcimone-node-1              241839476.0
7881   hifive    0  mcimone-node-1              251117610.0
31536  hifive    3  mcimone-node-1              233599118.0
23652  hifive    2  mcimone-node-1              226341798.0
31537  hifive    3  mcimone-node-1              224304384.0
7882   hifive    0  mcimone-node-1              207978252.0
15767  hifive    1  mcimone-node-1              224403098.0
23653  hifive    2  mcimone-node-1              223050652.0
31538  hifive    3  mcimone-node-1              223850414.0
15768  hifive    1  mcimone-node-1              219587870.0
7883   hifive    0  mcimone-node-1              223695594.0
15769  hifive    1  mcimone-node-1              225851464.0
7884   hifive    0  mcimone-node-1              207533258.0
23654  hifive    2  mcimone-node-1              220584236.0
31539  hifive    3  mcimone-node-1              224012882.0

[31536 rows x 4 columns]

In [71]:

Copied!

# Reshape to one column per (node, core) indexed by timestamp, then draw one
# subplot per column. The trailing ';' suppresses the text repr of the axes.
rate_by_core = (
    df[['timestamp','node','core','instructions_per_second']]
    .pivot_table(index='timestamp', columns=['node','core'], dropna=True, aggfunc='first')
)
rate_by_core.plot(figsize=[15,30], subplots=True);