This lists the datasets from trusted users that carry both the ‘bettr’ and ‘performance’ tags, including each dataset’s name, identifier and date:
NB: this should work anywhere (it doesn’t need to run inside Renku), provided the projects are public.
# List the datasets published by trusted users that are tagged with both
# 'bettr' and 'performance', and store them as quoted CSV rows
# ("name","identifier","date") in bettr_datasets.csv.
#
# Requires: bash, curl, jq >= 1.5 (for --argjson and any/2).

# comma-separated, quoted list of known email addresses
trusted_users='"izaskun.mallona@gmail.com","taupo@robinsonlab.uzh.ch"'

# NOTE(review): the endpoint may paginate its results -- confirm whether a
# per_page/page parameter is needed once more datasets exist.
#
# -f makes curl fail on HTTP errors instead of feeding an error page to jq.
# The e-mail list is handed to jq as data (--argjson) rather than spliced
# into the program text. any/2 yields a single boolean per condition, so each
# matching dataset is emitted exactly once even when it has repeated keywords
# or several trusted creators; the '?' suffix tolerates entries that lack a
# 'keywords' or 'published.creator' field instead of aborting the whole run.
curl -fsS "https://renkulab.io/knowledge-graph/datasets?query=bettr" \
  | jq -r --argjson trusted "[${trusted_users}]" '
      .[]
      | select(
          any(.keywords[]?; . == "bettr")
          and any(.keywords[]?; . == "performance")
          and any(.published.creator[]?;
                  .email as $email | any($trusted[]; . == $email))
        )
      | "\"\(.name)\",\"\(.identifier)\",\"\(.date)\""
    ' > bettr_datasets.csv

cat bettr_datasets.csv
## "oecd_dummy_perf","61995ab5-92ca-4716-915a-ee39919c8f7b","2021-06-14T08:26:04.676952Z"
## "soneson_scde_2018_perf","e68365d7-88b8-4a71-ae92-17d9dba45c75","2021-06-14T08:27:32.512978Z"
# Build perf_files.csv: one row per dataset listed in bettr_datasets.csv that
# ships a performances.csv file, with the raw GitLab URLs of its performance
# and method-characteristics files and a TRUE/FALSE flag telling whether each
# file could be read as a non-empty table.
#
# Input : bettr_datasets.csv  ("name","identifier","date" per line)
# Output: perf_files.csv
# Requires: bash, curl, jq >= 1.5, Rscript.

renkuKG=https://renkulab.io/knowledge-graph
renkuGL=https://renkulab.io/gitlab
# Branch used to build the raw-file URLs.
# TODO(review): confirm that every project keeps its data on 'master'.
branch=master

#######################################
# Check that a remote delimited file is a readable, non-empty table.
# Arguments:
#   $1 - URL of the file
#   $2 - "csv" (comma-separated) or "tsv" (tab-separated)
#   $3 - "TRUE" to additionally require at least one numeric column
# Outputs:
#   Writes TRUE or FALSE to stdout.
#######################################
check_remote_table() {
  # The URL is passed as an argument instead of being pasted into the R
  # source, so unusual characters cannot alter the program. inherits() is
  # used rather than is() because it lives in base R and therefore also
  # works where Rscript does not attach the 'methods' package.
  Rscript --vanilla -e '
    args <- commandArgs(trailingOnly = TRUE)
    reader <- if (identical(args[2], "tsv")) read.delim else read.csv
    x <- try(reader(url(args[1])), silent = TRUE)
    ok <- !inherits(x, "try-error") && nrow(x) > 0 && ncol(x) > 0 &&
      (!identical(args[3], "TRUE") || any(sapply(x, is.numeric)))
    cat(ok)
  ' "$1" "$2" "$3"
}

echo "name,date,performances,metric_characteristics,perf_ok,metchar_ok" > perf_files.csv

# Read the list line by line (names containing spaces stay in one piece) on
# file descriptor 3, so commands inside the loop cannot swallow the input.
while IFS=, read -r dname did ddate <&3 || [[ -n "$dname" ]]; do
  # name and date keep their CSV quotes; the identifier is used in URLs
  did=${did//\"/}
  [[ -n "$did" ]] || continue

  # fetch the dataset description once and reuse it for every lookup
  if ! meta=$(curl -fsS "$renkuKG/datasets/$did"); then
    printf 'warning: could not fetch dataset %s, skipping\n' "$did" >&2
    continue
  fi

  # get the project's path
  proj_path=$(jq -r '.project.path' <<<"$meta")

  # fetch the location of the performance file (first match only; a missing
  # or null location yields an empty string)
  perf=$(jq -r 'first(.hasPart[]?
                      | select(.name == "performances.csv")
                      | .atLocation // empty)' <<<"$meta")
  # if there's no performance file, jump to next iteration
  [[ -n "$perf" ]] || continue

  # url to the performance file in gitlab
  perf_file="$renkuGL/$proj_path/-/raw/$branch/$perf"
  # the performance table must be readable, non-empty and hold numbers
  check_perf=$(check_remote_table "$perf_file" csv TRUE)

  # fetch the metric information file
  metinfo=$(jq -r 'first(.hasPart[]?
                         | select(.name == "method_characteristics.tsv")
                         | .atLocation // empty)' <<<"$meta")
  if [[ -z "$metinfo" ]]; then
    metinfo_file=""
    check_info=FALSE
  else
    metinfo_file="$renkuGL/$proj_path/-/raw/$branch/$metinfo"
    # tab-separated file: parse it as such; it only has to be non-empty
    check_info=$(check_remote_table "$metinfo_file" tsv FALSE)
  fi

  printf '%s,%s,%s,%s,%s,%s\n' \
    "$dname" "$ddate" "$perf_file" "$metinfo_file" "$check_perf" "$check_info" \
    >> perf_files.csv
done 3< bettr_datasets.csv
#column -t -s, perf_files.csv
# R: load the per-dataset summary table (perf_files.csv) and display it.
read.csv("perf_files.csv")
## name date
## 1 soneson_scde_2018_perf 2021-06-14T08:27:32.512978Z
## performances
## 1 https://renkulab.io/gitlab/bettr_hackathon/perf-data/-/raw/master/soneson_scde_2018/performances.csv
## metric_characteristics
## 1 https://renkulab.io/gitlab/bettr_hackathon/perf-data/-/raw/master/soneson_scde_2018/method_characteristics.tsv
## perf_ok metchar_ok
## 1 TRUE TRUE