1+ <?php
2+
3+ namespace Swader \Diffbot \Entity ;
4+
5+ use Swader \Diffbot \Abstracts \Entity ;
6+
/**
 * Read-only accessor wrapper around the fields of a Diffbot "article" result.
 *
 * Every getter reads from the $objects array, which is expected to be
 * provided by the parent Entity class (not visible in this file).
 */
class Article extends Entity
{
    /**
     * Should always return "article"
     * @return string
     */
    public function getType()
    {
        return $this->objects['type'];
    }

    /**
     * Returns the URL which was crawled
     * @return string
     */
    public function getPageUrl()
    {
        return $this->objects['pageUrl'];
    }

    /**
     * Returns page Url which was resolved by redirects, if any.
     * For example, crawling a bitly link will make this method return the ultimate destination's URL.
     * Falls back to getPageUrl() when no resolved URL is present.
     * @return string
     */
    public function getResolvedPageUrl()
    {
        return (isset($this->objects['resolvedPageUrl']))
            ? $this->objects['resolvedPageUrl']
            : $this->getPageUrl();
    }

    /**
     * Returns title of article as deduced by Diffbot
     * @return string
     */
    public function getTitle()
    {
        return $this->objects['title'];
    }

    /**
     * Returns plaintext version of article (no HTML) as parsed by Diffbot.
     * Only the content is returned, the text in the surrounding (layout etc) elements is ignored.
     * @return string
     */
    public function getText()
    {
        return $this->objects['text'];
    }

    /**
     * Returns full HTML of the article's content - only the content, not the surrounding layout HTML.
     * @return string
     */
    public function getHtml()
    {
        return $this->objects['html'];
    }

    /**
     * Returns date as per http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3
     * Example date: "Wed, 18 Dec 2013 00:00:00 GMT"
     * Note that this is "strtotime" friendly for further conversions
     * @todo add more formats as method arguments
     * @return string
     */
    public function getDate()
    {
        return $this->objects['date'];
    }

    /**
     * Returns the full name of the author, as signed on the article's page
     * @return string
     */
    public function getAuthor()
    {
        return $this->objects['author'];
    }

    /**
     * The array returned will contain all tags that Diffbot's AI concluded match the content
     *
     * Note that these are *not* the meta tags as defined by the author, but machine learned ones.
     * Note also that tags may differ depending on URL. Visiting a bitly link vs visiting a fully resolved one
     * will sometimes yield different results. It is currently unknown why this happens.
     * The format of the array is:
     *
     * [
     *     [
     *         "id": 133907,
     *         "count": 3,
     *         "prevalence": 0.3103448275862069,
     *         "label": "Apache HTTP Server",
     *         "type": "Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity",
     *         "uri": "http://dbpedia.org/resource/Apache_HTTP_Server"
     *     ],
     *     [
     *         "id": 208652,
     *         "count": 5,
     *         "prevalence": 0.5172413793103449,
     *         "label": "PHP",
     *         "type": "Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity",
     *         "uri": "http://dbpedia.org/resource/PHP"
     *     ]
     * ]
     *
     * @return array
     */
    public function getTags()
    {
        return $this->objects['tags'];
    }

    /**
     * Alias for getLang()
     * @see getLang()
     * @return string
     */
    public function getHumanLanguage()
    {
        return $this->getLang();
    }

    /**
     * Returns the human language as determined by Diffbot when looking at content.
     * The code returned is a two-character ISO 639-1 code: http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
     * @return string
     */
    public function getLang()
    {
        return $this->objects['humanLanguage'];
    }

    /**
     * Number of pages automatically concatenated to form the text or html response.
     * By default, Diffbot will automatically concatenate up to 20 pages of an article.
     * Returns 1 when the response carries no page count.
     * @see http://support.diffbot.com/automatic-apis/handling-multiple-page-articles/
     * @return int
     */
    public function getNumPages()
    {
        return (isset($this->objects['numPages'])) ? $this->objects['numPages'] : 1;
    }

    /**
     * Array of all page URLs concatenated in a multipage article.
     * Empty array if article was not concatenated before being returned.
     * @see http://support.diffbot.com/automatic-apis/handling-multiple-page-articles/
     * @return array
     */
    public function getNextPages()
    {
        return (isset($this->objects['nextPages'])) ? $this->objects['nextPages'] : [];
    }

    /**
     * Returns an array of images found in the article's content.
     *
     * Note that this (tries) to ignore content-unrelated images like ads around the page, etc.
     * Returns an empty array when the response carries no images.
     * The format of the array will be:
     *
     * [
     *     {
     *         "height": 808,
     *         "diffbotUri": "image|3|-543943368",
     *         "naturalHeight": 808,
     *         "width": 717,
     *         "primary": true,
     *         "naturalWidth": 717,
     *         "url": "https://example.com/image1.png"
     *     },
     *     {
     *         "height": 506,
     *         "diffbotUri": "image|3|-844014913",
     *         "naturalHeight": 506,
     *         "width": 715,
     *         "naturalWidth": 715,
     *         "url": "https://example.com/image1.jpeg"
     *     }
     * ]
     *
     * @return array
     */
    public function getImages()
    {
        return (isset($this->objects['images'])) ? $this->objects['images'] : [];
    }

    /**
     * Returns an array of videos found in the article's content.
     *
     * Returns an empty array when the response carries no videos.
     * The format of the array will be:
     *
     * [
     *     {
     *         "diffbotUri": "video|3|-1138675744",
     *         "primary": true,
     *         "url": "http://player.vimeo.com/video/22439234"
     *     },
     *     {
     *         "diffbotUri": "video|3|-1138675744",
     *         "primary": true,
     *         "url": "http://player.vimeo.com/video/22439234"
     *     }
     * ]
     *
     * @return array
     */
    public function getVideos()
    {
        // Reads the "videos" key; the previous implementation returned the "images" data by mistake.
        return (isset($this->objects['videos'])) ? $this->objects['videos'] : [];
    }

    /**
     * An internal identifier for Diffbot, used for indexing in their databases
     * @return string
     */
    public function getDiffbotUri()
    {
        return $this->objects['diffbotUri'];
    }

    /**
     * Not implemented yet; currently always returns null.
     * @todo implement
     * @return null
     */
    public function getLinks()
    {
        return null;
    }

    /**
     * Not implemented yet; currently always returns null.
     * @todo implement
     * @return null
     */
    public function getMeta()
    {
        return null;
    }

    /**
     * Not implemented yet; currently always returns null.
     * @todo implement
     * @return null
     */
    public function getQueryString()
    {
        return null;
    }
}
0 commit comments