Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified docs/_build/doctrees/environment.pickle
Binary file not shown.
Binary file modified docs/_build/doctrees/tools/indexing.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/tools/metadata.doctree
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ <h1>Source code for gen3.tools.indexing.download_manifest</h1><div class="highli
<span class="sd">&quot;&quot;&quot;</span>
<span class="kn">import</span> <span class="nn">asyncio</span>
<span class="kn">import</span> <span class="nn">aiofiles</span>
<span class="kn">import</span> <span class="nn">aiohttp</span>
<span class="kn">import</span> <span class="nn">click</span>
<span class="kn">import</span> <span class="nn">json</span>
<span class="kn">import</span> <span class="nn">time</span>
Expand Down Expand Up @@ -183,18 +184,23 @@ <h1>Source code for gen3.tools.indexing.download_manifest</h1><div class="highli
<span class="n">page_chunks</span> <span class="o">=</span> <span class="p">[]</span>

<span class="c1"># used when an input manifest is provided, this will only read record info for</span>
<span class="c1"># the records referenced in the manifest based on their checksums</span>
<span class="c1"># the records referenced in the manifest based on their checksums or guids</span>
<span class="n">record_chunks</span> <span class="o">=</span> <span class="p">[]</span>

<span class="k">if</span> <span class="n">input_manifest</span><span class="p">:</span>
<span class="c1"># create chunks of checksums</span>
<span class="c1"># create chunks of requested records</span>
<span class="n">logging</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;parsing input file </span><span class="si">{</span><span class="n">input_manifest</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">input_records</span><span class="p">,</span> <span class="n">headers</span> <span class="o">=</span> <span class="n">get_and_verify_fileinfos_from_manifest</span><span class="p">(</span><span class="n">input_manifest</span><span class="p">)</span>

<span class="c1"># allow returning invalid b/c the input manifest won&#39;t always have complete</span>
<span class="c1"># information. e.g. it may not have a checksum b/c only a GUID column is provided</span>
<span class="n">input_records</span><span class="p">,</span> <span class="n">headers</span> <span class="o">=</span> <span class="n">get_and_verify_fileinfos_from_manifest</span><span class="p">(</span>
<span class="n">input_manifest</span><span class="p">,</span> <span class="n">return_invalid_records</span><span class="o">=</span><span class="kc">True</span>
<span class="p">)</span>
<span class="n">num_input_records</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">input_records</span><span class="p">)</span>

<span class="k">if</span> <span class="ow">not</span> <span class="n">num_input_records</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;No valid records found in provided input file: </span><span class="si">{</span><span class="n">input_manifest</span><span class="si">}</span><span class="s2">. &quot;</span>
<span class="sa">f</span><span class="s2">&quot;No records found in provided input file: </span><span class="si">{</span><span class="n">input_manifest</span><span class="si">}</span><span class="s2">. &quot;</span>
<span class="s2">&quot;Please check previous logs.&quot;</span>
<span class="p">)</span>

Expand Down Expand Up @@ -235,7 +241,7 @@ <h1>Source code for gen3.tools.indexing.download_manifest</h1><div class="highli
<span class="s2">&quot;|||&quot;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">,</span> <span class="n">record_chunks</span><span class="p">[</span><span class="n">x</span><span class="p">]))</span> <span class="k">if</span> <span class="n">record_chunks</span> <span class="k">else</span> <span class="s2">&quot;|||&quot;</span>
<span class="p">)</span>

<span class="c1"># write record_checksum chunks to temporary files since the size can overload</span>
<span class="c1"># write requested records chunks to temporary files since the size can overload</span>
<span class="c1"># command line arguments</span>
<span class="n">input_records_chunk_filename</span> <span class="o">=</span> <span class="n">TMP_FOLDER</span> <span class="o">+</span> <span class="sa">f</span><span class="s2">&quot;input/input_records_chunk_</span><span class="si">{</span><span class="n">x</span><span class="si">}</span><span class="s2">.txt&quot;</span>
<span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
Expand Down Expand Up @@ -435,11 +441,12 @@ <h1>Source code for gen3.tools.indexing.download_manifest</h1><div class="highli

<span class="sd"> Args:</span>
<span class="sd"> commons_url (str): root domain for commons where indexd lives</span>
<span class="sd"> input_record (int/str): indexd record to request (must contain checksum)</span>
<span class="sd"> input_record (int/str): indexd record to request (must contain checksum OR guid)</span>
<span class="sd"> lock (asyncio.Semaphore): semaphore used to limit amount of concurrent http</span>
<span class="sd"> connections</span>
<span class="sd"> queue (asyncio.Queue): queue to put indexd records in</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">guid</span> <span class="o">=</span> <span class="n">input_record</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">GUID_STANDARD_KEY</span><span class="p">)</span>
<span class="n">checksum</span> <span class="o">=</span> <span class="n">input_record</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">MD5_STANDARD_KEY</span><span class="p">)</span>

<span class="n">index</span> <span class="o">=</span> <span class="n">Gen3Index</span><span class="p">(</span><span class="n">commons_url</span><span class="p">)</span>
Expand All @@ -449,9 +456,20 @@ <h1>Source code for gen3.tools.indexing.download_manifest</h1><div class="highli
<span class="k">if</span> <span class="s2">&quot;https&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">commons_url</span><span class="p">:</span>
<span class="n">ssl</span> <span class="o">=</span> <span class="kc">False</span>

<span class="n">records</span> <span class="o">=</span> <span class="k">await</span> <span class="n">index</span><span class="o">.</span><span class="n">async_get_records_from_checksum</span><span class="p">(</span>
<span class="n">checksum</span><span class="o">=</span><span class="n">checksum</span><span class="p">,</span> <span class="n">_ssl</span><span class="o">=</span><span class="n">ssl</span>
<span class="p">)</span>
<span class="n">records</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">if</span> <span class="n">guid</span><span class="p">:</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">matched_record</span> <span class="o">=</span> <span class="k">await</span> <span class="n">index</span><span class="o">.</span><span class="n">async_get_record</span><span class="p">(</span><span class="n">guid</span><span class="o">=</span><span class="n">guid</span><span class="p">,</span> <span class="n">_ssl</span><span class="o">=</span><span class="n">ssl</span><span class="p">)</span>
<span class="n">records</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">matched_record</span><span class="p">)</span>
<span class="k">except</span> <span class="n">aiohttp</span><span class="o">.</span><span class="n">client_exceptions</span><span class="o">.</span><span class="n">ClientResponseError</span> <span class="k">as</span> <span class="n">exc</span><span class="p">:</span>
<span class="c1"># this means the GUID likely doesn&#39;t exist or there was an error</span>
<span class="c1"># receiving it. In this case, log and ignore</span>
<span class="n">logging</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;guid: </span><span class="si">{</span><span class="n">guid</span><span class="si">}</span><span class="s2"> not found. Error: </span><span class="si">{</span><span class="n">exc</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">matched_records</span> <span class="o">=</span> <span class="k">await</span> <span class="n">index</span><span class="o">.</span><span class="n">async_get_records_from_checksum</span><span class="p">(</span>
<span class="n">checksum</span><span class="o">=</span><span class="n">checksum</span><span class="p">,</span> <span class="n">_ssl</span><span class="o">=</span><span class="n">ssl</span>
<span class="p">)</span>
<span class="n">records</span> <span class="o">+=</span> <span class="n">matched_records</span>

<span class="c1"># if nothing was found, we still want to output the input record</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">records</span><span class="p">:</span>
Expand Down
2 changes: 1 addition & 1 deletion docs/_build/html/searchindex.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/_build/html/tools/indexing.html
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ <h1>Indexing Tools<a class="headerlink" href="#indexing-tools" title="Link to th

<dl class="py function">
<dt class="sig sig-object py" id="gen3.tools.indexing.verify_manifest.async_verify_object_manifest">
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.indexing.verify_manifest.</span></span><span class="sig-name descname"><span class="pre">async_verify_object_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'acl':</span> <span class="pre">&lt;function</span> <span class="pre">_get_acl_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'authz':</span> <span class="pre">&lt;function</span> <span class="pre">_get_authz_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_name':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_name_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_size':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_size_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'guid':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'md5':</span> <span class="pre">&lt;function</span> <span class="pre">_get_md5_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'urls':</span> <span class="pre">&lt;function</span> <span class="pre">_get_urls_from_row&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span 
class="pre">output_filename='verify-manifest-errors-1701900364.6706986.log'</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/indexing/verify_manifest.html#async_verify_object_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.indexing.verify_manifest.async_verify_object_manifest" title="Link to this definition">¶</a></dt>
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.indexing.verify_manifest.</span></span><span class="sig-name descname"><span class="pre">async_verify_object_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'acl':</span> <span class="pre">&lt;function</span> <span class="pre">_get_acl_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'authz':</span> <span class="pre">&lt;function</span> <span class="pre">_get_authz_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_name':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_name_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_size':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_size_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'guid':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'md5':</span> <span class="pre">&lt;function</span> <span class="pre">_get_md5_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'urls':</span> <span class="pre">&lt;function</span> <span class="pre">_get_urls_from_row&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span 
class="pre">output_filename='verify-manifest-errors-1701965194.2907536.log'</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/indexing/verify_manifest.html#async_verify_object_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.indexing.verify_manifest.async_verify_object_manifest" title="Link to this definition">¶</a></dt>
<dd><p>Verify all file object records into a manifest csv</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
Expand Down
2 changes: 1 addition & 1 deletion docs/_build/html/tools/metadata.html
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ <h1>Metadata Tools<a class="headerlink" href="#metadata-tools" title="Link to th

<dl class="py function">
<dt class="sig sig-object py" id="gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest">
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.metadata.ingest_manifest.</span></span><span class="sig-name descname"><span class="pre">async_ingest_metadata_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_source</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">auth=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'guid_for_row':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_for_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'indexed_file_object_guid':</span> <span class="pre">&lt;function</span> <span class="pre">_query_for_associated_indexd_record_guid&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_filename='ingest-metadata-manifest-errors-1701900364.9645383.log'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">get_guid_from_file=True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_type=None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/metadata/ingest_manifest.html#async_ingest_metadata_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest" title="Link to this 
definition">¶</a></dt>
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.metadata.ingest_manifest.</span></span><span class="sig-name descname"><span class="pre">async_ingest_metadata_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_source</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">auth=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'guid_for_row':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_for_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'indexed_file_object_guid':</span> <span class="pre">&lt;function</span> <span class="pre">_query_for_associated_indexd_record_guid&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_filename='ingest-metadata-manifest-errors-1701965194.6133904.log'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">get_guid_from_file=True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_type=None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/metadata/ingest_manifest.html#async_ingest_metadata_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest" title="Link to this 
definition">¶</a></dt>
<dd><p>Ingest all metadata records into a manifest csv</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
Expand Down
2 changes: 1 addition & 1 deletion gen3/cli/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def crosswalk():
"--input-manifest",
"input_manifest",
help="Input file. Read available object data only for records referenced in this file. "
"Currently requires at a minimum an `m5d` column with checksum.",
"Currently requires at a minimum an `md5` OR `guid` column to be populated.",
default=None,
type=click.Path(writable=True),
show_default=True,
Expand Down
Loading