diff --git a/lpassare.ipynb b/lpassare.ipynb new file mode 100644 index 0000000..756de57 --- /dev/null +++ b/lpassare.ipynb @@ -0,0 +1,150 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "import sys\n", + "import re\n", + "import pymongo\n", + "import json\n", + "import time\n", + "import datetime\n", + "import requests\n", + "\n", + "dbname = \"fdac18mp2\" #please use this database\n", + "collname = \"glprj_lpassare\" #please modify so you store data in your collection\n", + "# beginning page index\n", + "begin = \"0\"\n", + "client = pymongo.MongoClient()\n", + "\n", + "db = client[dbname]\n", + "coll = db[collname]\n", + "letter=\"b\"\n", + "\n", + "beginurl = \"https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page=\" + begin + \\\n", + " \"&per_page=99&simple=false&sort=desc&starred=false&statistics=false&with_custom_attributes=false&with_issues_enabled=false&with_merge_requests_enabled=false\"\n", + "\n", + "\n", + "gleft = 0\n", + "\n", + "header = {'per_page': 99}\n", + "\n", + "# check remaining query chances for rate-limit restriction\n", + "def wait(left):\n", + " global header\n", + " while (left < 20):\n", + " l = requests.get('https://gitlab.com/api/v4/projects', headers=header)\n", + " if (l.ok):\n", + " left = int(l.headers.get('RateLimit-Remaining'))\n", + " time .sleep(60)\n", + " return left\n", + "\n", + "# send queries and extract urls \n", + "def get(url, coll):\n", + "\n", + " global gleft\n", + " global header\n", + " global bginnum\n", + " gleft = wait(gleft)\n", + " values = []\n", + " size = 0\n", + "\n", + " try:\n", + " r = requests .get(url, headers=header)\n", + " time .sleep(0.5)\n", + " # got blocked\n", + " if r.status_code == 403:\n", + " return \"got blocked\", str(bginnum)\n", + " if (r.ok):\n", + "\n", + " gleft = int(r.headers.get('RateLimit-Remaining'))\n", + " lll = r.headers.get('Link')\n", + " t = r.text\n", + " array = json.loads(t)\n", + " \n", + " for el in array:\n", + " if el['name'].lower().startswith(letter):\n", + " el['site'] = \"git\"\n", + " count += 1\n", + " coll.insert_one(el)\n", + " if count > 49:\n", + " return\n", + " \n", + " \n", + " #next page\n", + " while ('; rel=\"next\"' in lll):\n", + " gleft = int(r.headers.get('RateLimit-Remaining'))\n", + " gleft = wait(gleft)\n", + " # extract next page url\n", + " ll = lll.replace(';', ',').split(',')\n", + " url = ll[ll.index(' rel=\"next\"') -\n", + " 1].replace('<', '').replace('>', '').lstrip()\n", + " \n", + " try:\n", + " r = requests .get(url, headers=header)\n", + " if r.status_code == 403:\n", + " return \"got blocked\", str(bginnum)\n", + " if (r.ok):\n", + " lll = r.headers.get('Link')\n", + " t = r.text\n", + " array1 = json.loads(t)\n", + " for el in array1:\n", + " coll.insert(el)\n", + " else:\n", + " sys.stderr.write(\"url can not found:\\n\" + url + '\\n')\n", + " return \n", + " except requests.exceptions.ConnectionError:\n", + " sys.stderr.write('could not get ' + url + '\\n')\n", + "\n", + " else:\n", + " sys.stderr.write(\"url can not found:\\n\" + url + '\\n')\n", + " return\n", + "\n", + " except requests.exceptions.ConnectionError:\n", + " sys.stderr.write('could not get ' + url + '\\n')\n", + " except Exception as e:\n", + " sys.stderr.write(url + ';' + str(e) + '\\n')\n", + " \n", + "#start retrieving \n", + "get(beginurl,coll)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gitlab \n", + "gl = gitlab.Gitlab('http://10.0.0.1')\n", + "gl.search('projects','b',per_page=50)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}